// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#include "ppl/cv/arm/warpaffine.h"
#include "ppl/cv/types.h"
#include "ppl/common/sys.h"
#include "ppl/common/log.h"
#include "common.hpp"
#include "operation_utils.hpp"
#include <arm_neon.h>
#include <limits.h>
#include <algorithm>
#include <cmath>

// Expands to the smaller of A and B (B when they compare equal).
// This is a textual macro: whichever argument is selected gets evaluated
// twice, so do not pass expressions with side effects (e.g. MIN_(i++, n)).
#define MIN_(A, B) (((A) < (B)) ? (A) : (B))
namespace ppl {
namespace cv {
namespace arm {

// Rounds `ptr` up to the next address that is a multiple of `n` bytes and
// returns it as the same pointer type. An already aligned pointer is
// returned unchanged.
//
// ptr : pointer to align.
// n   : alignment in bytes; defaults to sizeof(_Tp).
//
// The round-up relies on masking with -n, which only yields a multiple of n
// when n is a power of two.
template <typename _Tp>
static inline _Tp *alignPtr(_Tp *ptr, int32_t n = (int32_t)sizeof(_Tp))
{
    const size_t address = reinterpret_cast<size_t>(ptr);
    // Two's-complement negation of a power of two clears exactly the low bits.
    const size_t lowBitsCleared = static_cast<size_t>(-n);
    const size_t bumped         = address + static_cast<size_t>(n) - 1;
    return reinterpret_cast<_Tp *>(bumped & lowBitsCleared);
}

// Returns a pointer to the start of row `row` in a buffer that begins at
// `base`.
//
// base   : pointer to the first element of row 0.
// stride : distance between the starts of consecutive rows, counted in
//          elements of T (the offset is applied with T* pointer arithmetic,
//          not as a byte offset).
// row    : zero-based row index.
//
// The offset is computed in 64-bit arithmetic so that row * stride cannot
// overflow a 32-bit signed int on large buffers.
template <typename T>
inline T *getRowPtr(T *base, int32_t stride, int32_t row)
{
    const int64_t elementOffset = static_cast<int64_t>(row) * stride;
    return base + elementOffset;
}

// Computes a / b * b. For integer T this is `a` reduced to a multiple of `b`,
// with the division truncating toward zero (e.g. 10, 4 -> 8 and -7, 4 -> -4).
// `b` must be non-zero for integer T.
template <typename T>
inline const T round(const T &a, const T &b)
{
    // `auto` keeps the promoted type of the division, as in a single expression.
    const auto wholeMultiples = a / b;
    return wholeMultiples * b;
}

// Computes (a + b - 1) / b * b. For non-negative integer `a` and positive
// integer `b` this is the smallest multiple of `b` that is >= `a`
// (e.g. 10, 4 -> 12 and 8, 4 -> 8). `b` must be non-zero for integer T.
template <typename T>
inline const T round_up(const T &a, const T &b)
{
    // Bias the value so the truncating division below rounds upward.
    const auto biased = a + b - static_cast<T>(1);
    return biased / b * b;
}

// Fixed-point precision settings: bit counts first, then the values derived
// from them (each derived value is 2 raised to the matching bit count).
constexpr int32_t AB_BITS               = 10;
constexpr int32_t INTER_BITS            = 5;
constexpr int32_t INTER_REMAP_COEF_BITS = 15;

constexpr int32_t AB_SCALE = 1 << AB_BITS; // 1024
// 32; INTER_TAB_SIZE * INTER_TAB_SIZE equals the 1024 entries of BilinearTab_i.
constexpr int32_t INTER_TAB_SIZE = 1 << INTER_BITS;
// 32768; the four weights of a BilinearTab_i entry appear to sum to this.
constexpr int32_t INTER_REMAP_COEF_SCALE = 1 << INTER_REMAP_COEF_BITS;

//static int16_t BilinearTab_i[1024][2][2];
static int16_t BilinearTab_i[1024][2][2] = {
    {{32767, 0}, {0, 1}},
    {{31744, 1024}, {0, 0}},
    {{30720, 2048}, {0, 0}},
    {{29696, 3072}, {0, 0}},
    {{28672, 4096}, {0, 0}},
    {{27648, 5120}, {0, 0}},
    {{26624, 6144}, {0, 0}},
    {{25600, 7168}, {0, 0}},
    {{24576, 8192}, {0, 0}},
    {{23552, 9216}, {0, 0}},
    {{22528, 10240}, {0, 0}},
    {{21504, 11264}, {0, 0}},
    {{20480, 12288}, {0, 0}},
    {{19456, 13312}, {0, 0}},
    {{18432, 14336}, {0, 0}},
    {{17408, 15360}, {0, 0}},
    {{16384, 16384}, {0, 0}},
    {{15360, 17408}, {0, 0}},
    {{14336, 18432}, {0, 0}},
    {{13312, 19456}, {0, 0}},
    {{12288, 20480}, {0, 0}},
    {{11264, 21504}, {0, 0}},
    {{10240, 22528}, {0, 0}},
    {{9216, 23552}, {0, 0}},
    {{8192, 24576}, {0, 0}},
    {{7168, 25600}, {0, 0}},
    {{6144, 26624}, {0, 0}},
    {{5120, 27648}, {0, 0}},
    {{4096, 28672}, {0, 0}},
    {{3072, 29696}, {0, 0}},
    {{2048, 30720}, {0, 0}},
    {{1024, 31744}, {0, 0}},
    {{31744, 0}, {1024, 0}},
    {{30752, 992}, {992, 32}},
    {{29760, 1984}, {960, 64}},
    {{28768, 2976}, {928, 96}},
    {{27776, 3968}, {896, 128}},
    {{26784, 4960}, {864, 160}},
    {{25792, 5952}, {832, 192}},
    {{24800, 6944}, {800, 224}},
    {{23808, 7936}, {768, 256}},
    {{22816, 8928}, {736, 288}},
    {{21824, 9920}, {704, 320}},
    {{20832, 10912}, {672, 352}},
    {{19840, 11904}, {640, 384}},
    {{18848, 12896}, {608, 416}},
    {{17856, 13888}, {576, 448}},
    {{16864, 14880}, {544, 480}},
    {{15872, 15872}, {512, 512}},
    {{14880, 16864}, {480, 544}},
    {{13888, 17856}, {448, 576}},
    {{12896, 18848}, {416, 608}},
    {{11904, 19840}, {384, 640}},
    {{10912, 20832}, {352, 672}},
    {{9920, 21824}, {320, 704}},
    {{8928, 22816}, {288, 736}},
    {{7936, 23808}, {256, 768}},
    {{6944, 24800}, {224, 800}},
    {{5952, 25792}, {192, 832}},
    {{4960, 26784}, {160, 864}},
    {{3968, 27776}, {128, 896}},
    {{2976, 28768}, {96, 928}},
    {{1984, 29760}, {64, 960}},
    {{992, 30752}, {32, 992}},
    {{30720, 0}, {2048, 0}},
    {{29760, 960}, {1984, 64}},
    {{28800, 1920}, {1920, 128}},
    {{27840, 2880}, {1856, 192}},
    {{26880, 3840}, {1792, 256}},
    {{25920, 4800}, {1728, 320}},
    {{24960, 5760}, {1664, 384}},
    {{24000, 6720}, {1600, 448}},
    {{23040, 7680}, {1536, 512}},
    {{22080, 8640}, {1472, 576}},
    {{21120, 9600}, {1408, 640}},
    {{20160, 10560}, {1344, 704}},
    {{19200, 11520}, {1280, 768}},
    {{18240, 12480}, {1216, 832}},
    {{17280, 13440}, {1152, 896}},
    {{16320, 14400}, {1088, 960}},
    {{15360, 15360}, {1024, 1024}},
    {{14400, 16320}, {960, 1088}},
    {{13440, 17280}, {896, 1152}},
    {{12480, 18240}, {832, 1216}},
    {{11520, 19200}, {768, 1280}},
    {{10560, 20160}, {704, 1344}},
    {{9600, 21120}, {640, 1408}},
    {{8640, 22080}, {576, 1472}},
    {{7680, 23040}, {512, 1536}},
    {{6720, 24000}, {448, 1600}},
    {{5760, 24960}, {384, 1664}},
    {{4800, 25920}, {320, 1728}},
    {{3840, 26880}, {256, 1792}},
    {{2880, 27840}, {192, 1856}},
    {{1920, 28800}, {128, 1920}},
    {{960, 29760}, {64, 1984}},
    {{29696, 0}, {3072, 0}},
    {{28768, 928}, {2976, 96}},
    {{27840, 1856}, {2880, 192}},
    {{26912, 2784}, {2784, 288}},
    {{25984, 3712}, {2688, 384}},
    {{25056, 4640}, {2592, 480}},
    {{24128, 5568}, {2496, 576}},
    {{23200, 6496}, {2400, 672}},
    {{22272, 7424}, {2304, 768}},
    {{21344, 8352}, {2208, 864}},
    {{20416, 9280}, {2112, 960}},
    {{19488, 10208}, {2016, 1056}},
    {{18560, 11136}, {1920, 1152}},
    {{17632, 12064}, {1824, 1248}},
    {{16704, 12992}, {1728, 1344}},
    {{15776, 13920}, {1632, 1440}},
    {{14848, 14848}, {1536, 1536}},
    {{13920, 15776}, {1440, 1632}},
    {{12992, 16704}, {1344, 1728}},
    {{12064, 17632}, {1248, 1824}},
    {{11136, 18560}, {1152, 1920}},
    {{10208, 19488}, {1056, 2016}},
    {{9280, 20416}, {960, 2112}},
    {{8352, 21344}, {864, 2208}},
    {{7424, 22272}, {768, 2304}},
    {{6496, 23200}, {672, 2400}},
    {{5568, 24128}, {576, 2496}},
    {{4640, 25056}, {480, 2592}},
    {{3712, 25984}, {384, 2688}},
    {{2784, 26912}, {288, 2784}},
    {{1856, 27840}, {192, 2880}},
    {{928, 28768}, {96, 2976}},
    {{28672, 0}, {4096, 0}},
    {{27776, 896}, {3968, 128}},
    {{26880, 1792}, {3840, 256}},
    {{25984, 2688}, {3712, 384}},
    {{25088, 3584}, {3584, 512}},
    {{24192, 4480}, {3456, 640}},
    {{23296, 5376}, {3328, 768}},
    {{22400, 6272}, {3200, 896}},
    {{21504, 7168}, {3072, 1024}},
    {{20608, 8064}, {2944, 1152}},
    {{19712, 8960}, {2816, 1280}},
    {{18816, 9856}, {2688, 1408}},
    {{17920, 10752}, {2560, 1536}},
    {{17024, 11648}, {2432, 1664}},
    {{16128, 12544}, {2304, 1792}},
    {{15232, 13440}, {2176, 1920}},
    {{14336, 14336}, {2048, 2048}},
    {{13440, 15232}, {1920, 2176}},
    {{12544, 16128}, {1792, 2304}},
    {{11648, 17024}, {1664, 2432}},
    {{10752, 17920}, {1536, 2560}},
    {{9856, 18816}, {1408, 2688}},
    {{8960, 19712}, {1280, 2816}},
    {{8064, 20608}, {1152, 2944}},
    {{7168, 21504}, {1024, 3072}},
    {{6272, 22400}, {896, 3200}},
    {{5376, 23296}, {768, 3328}},
    {{4480, 24192}, {640, 3456}},
    {{3584, 25088}, {512, 3584}},
    {{2688, 25984}, {384, 3712}},
    {{1792, 26880}, {256, 3840}},
    {{896, 27776}, {128, 3968}},
    {{27648, 0}, {5120, 0}},
    {{26784, 864}, {4960, 160}},
    {{25920, 1728}, {4800, 320}},
    {{25056, 2592}, {4640, 480}},
    {{24192, 3456}, {4480, 640}},
    {{23328, 4320}, {4320, 800}},
    {{22464, 5184}, {4160, 960}},
    {{21600, 6048}, {4000, 1120}},
    {{20736, 6912}, {3840, 1280}},
    {{19872, 7776}, {3680, 1440}},
    {{19008, 8640}, {3520, 1600}},
    {{18144, 9504}, {3360, 1760}},
    {{17280, 10368}, {3200, 1920}},
    {{16416, 11232}, {3040, 2080}},
    {{15552, 12096}, {2880, 2240}},
    {{14688, 12960}, {2720, 2400}},
    {{13824, 13824}, {2560, 2560}},
    {{12960, 14688}, {2400, 2720}},
    {{12096, 15552}, {2240, 2880}},
    {{11232, 16416}, {2080, 3040}},
    {{10368, 17280}, {1920, 3200}},
    {{9504, 18144}, {1760, 3360}},
    {{8640, 19008}, {1600, 3520}},
    {{7776, 19872}, {1440, 3680}},
    {{6912, 20736}, {1280, 3840}},
    {{6048, 21600}, {1120, 4000}},
    {{5184, 22464}, {960, 4160}},
    {{4320, 23328}, {800, 4320}},
    {{3456, 24192}, {640, 4480}},
    {{2592, 25056}, {480, 4640}},
    {{1728, 25920}, {320, 4800}},
    {{864, 26784}, {160, 4960}},
    {{26624, 0}, {6144, 0}},
    {{25792, 832}, {5952, 192}},
    {{24960, 1664}, {5760, 384}},
    {{24128, 2496}, {5568, 576}},
    {{23296, 3328}, {5376, 768}},
    {{22464, 4160}, {5184, 960}},
    {{21632, 4992}, {4992, 1152}},
    {{20800, 5824}, {4800, 1344}},
    {{19968, 6656}, {4608, 1536}},
    {{19136, 7488}, {4416, 1728}},
    {{18304, 8320}, {4224, 1920}},
    {{17472, 9152}, {4032, 2112}},
    {{16640, 9984}, {3840, 2304}},
    {{15808, 10816}, {3648, 2496}},
    {{14976, 11648}, {3456, 2688}},
    {{14144, 12480}, {3264, 2880}},
    {{13312, 13312}, {3072, 3072}},
    {{12480, 14144}, {2880, 3264}},
    {{11648, 14976}, {2688, 3456}},
    {{10816, 15808}, {2496, 3648}},
    {{9984, 16640}, {2304, 3840}},
    {{9152, 17472}, {2112, 4032}},
    {{8320, 18304}, {1920, 4224}},
    {{7488, 19136}, {1728, 4416}},
    {{6656, 19968}, {1536, 4608}},
    {{5824, 20800}, {1344, 4800}},
    {{4992, 21632}, {1152, 4992}},
    {{4160, 22464}, {960, 5184}},
    {{3328, 23296}, {768, 5376}},
    {{2496, 24128}, {576, 5568}},
    {{1664, 24960}, {384, 5760}},
    {{832, 25792}, {192, 5952}},
    {{25600, 0}, {7168, 0}},
    {{24800, 800}, {6944, 224}},
    {{24000, 1600}, {6720, 448}},
    {{23200, 2400}, {6496, 672}},
    {{22400, 3200}, {6272, 896}},
    {{21600, 4000}, {6048, 1120}},
    {{20800, 4800}, {5824, 1344}},
    {{20000, 5600}, {5600, 1568}},
    {{19200, 6400}, {5376, 1792}},
    {{18400, 7200}, {5152, 2016}},
    {{17600, 8000}, {4928, 2240}},
    {{16800, 8800}, {4704, 2464}},
    {{16000, 9600}, {4480, 2688}},
    {{15200, 10400}, {4256, 2912}},
    {{14400, 11200}, {4032, 3136}},
    {{13600, 12000}, {3808, 3360}},
    {{12800, 12800}, {3584, 3584}},
    {{12000, 13600}, {3360, 3808}},
    {{11200, 14400}, {3136, 4032}},
    {{10400, 15200}, {2912, 4256}},
    {{9600, 16000}, {2688, 4480}},
    {{8800, 16800}, {2464, 4704}},
    {{8000, 17600}, {2240, 4928}},
    {{7200, 18400}, {2016, 5152}},
    {{6400, 19200}, {1792, 5376}},
    {{5600, 20000}, {1568, 5600}},
    {{4800, 20800}, {1344, 5824}},
    {{4000, 21600}, {1120, 6048}},
    {{3200, 22400}, {896, 6272}},
    {{2400, 23200}, {672, 6496}},
    {{1600, 24000}, {448, 6720}},
    {{800, 24800}, {224, 6944}},
    {{24576, 0}, {8192, 0}},
    {{23808, 768}, {7936, 256}},
    {{23040, 1536}, {7680, 512}},
    {{22272, 2304}, {7424, 768}},
    {{21504, 3072}, {7168, 1024}},
    {{20736, 3840}, {6912, 1280}},
    {{19968, 4608}, {6656, 1536}},
    {{19200, 5376}, {6400, 1792}},
    {{18432, 6144}, {6144, 2048}},
    {{17664, 6912}, {5888, 2304}},
    {{16896, 7680}, {5632, 2560}},
    {{16128, 8448}, {5376, 2816}},
    {{15360, 9216}, {5120, 3072}},
    {{14592, 9984}, {4864, 3328}},
    {{13824, 10752}, {4608, 3584}},
    {{13056, 11520}, {4352, 3840}},
    {{12288, 12288}, {4096, 4096}},
    {{11520, 13056}, {3840, 4352}},
    {{10752, 13824}, {3584, 4608}},
    {{9984, 14592}, {3328, 4864}},
    {{9216, 15360}, {3072, 5120}},
    {{8448, 16128}, {2816, 5376}},
    {{7680, 16896}, {2560, 5632}},
    {{6912, 17664}, {2304, 5888}},
    {{6144, 18432}, {2048, 6144}},
    {{5376, 19200}, {1792, 6400}},
    {{4608, 19968}, {1536, 6656}},
    {{3840, 20736}, {1280, 6912}},
    {{3072, 21504}, {1024, 7168}},
    {{2304, 22272}, {768, 7424}},
    {{1536, 23040}, {512, 7680}},
    {{768, 23808}, {256, 7936}},
    {{23552, 0}, {9216, 0}},
    {{22816, 736}, {8928, 288}},
    {{22080, 1472}, {8640, 576}},
    {{21344, 2208}, {8352, 864}},
    {{20608, 2944}, {8064, 1152}},
    {{19872, 3680}, {7776, 1440}},
    {{19136, 4416}, {7488, 1728}},
    {{18400, 5152}, {7200, 2016}},
    {{17664, 5888}, {6912, 2304}},
    {{16928, 6624}, {6624, 2592}},
    {{16192, 7360}, {6336, 2880}},
    {{15456, 8096}, {6048, 3168}},
    {{14720, 8832}, {5760, 3456}},
    {{13984, 9568}, {5472, 3744}},
    {{13248, 10304}, {5184, 4032}},
    {{12512, 11040}, {4896, 4320}},
    {{11776, 11776}, {4608, 4608}},
    {{11040, 12512}, {4320, 4896}},
    {{10304, 13248}, {4032, 5184}},
    {{9568, 13984}, {3744, 5472}},
    {{8832, 14720}, {3456, 5760}},
    {{8096, 15456}, {3168, 6048}},
    {{7360, 16192}, {2880, 6336}},
    {{6624, 16928}, {2592, 6624}},
    {{5888, 17664}, {2304, 6912}},
    {{5152, 18400}, {2016, 7200}},
    {{4416, 19136}, {1728, 7488}},
    {{3680, 19872}, {1440, 7776}},
    {{2944, 20608}, {1152, 8064}},
    {{2208, 21344}, {864, 8352}},
    {{1472, 22080}, {576, 8640}},
    {{736, 22816}, {288, 8928}},
    {{22528, 0}, {10240, 0}},
    {{21824, 704}, {9920, 320}},
    {{21120, 1408}, {9600, 640}},
    {{20416, 2112}, {9280, 960}},
    {{19712, 2816}, {8960, 1280}},
    {{19008, 3520}, {8640, 1600}},
    {{18304, 4224}, {8320, 1920}},
    {{17600, 4928}, {8000, 2240}},
    {{16896, 5632}, {7680, 2560}},
    {{16192, 6336}, {7360, 2880}},
    {{15488, 7040}, {7040, 3200}},
    {{14784, 7744}, {6720, 3520}},
    {{14080, 8448}, {6400, 3840}},
    {{13376, 9152}, {6080, 4160}},
    {{12672, 9856}, {5760, 4480}},
    {{11968, 10560}, {5440, 4800}},
    {{11264, 11264}, {5120, 5120}},
    {{10560, 11968}, {4800, 5440}},
    {{9856, 12672}, {4480, 5760}},
    {{9152, 13376}, {4160, 6080}},
    {{8448, 14080}, {3840, 6400}},
    {{7744, 14784}, {3520, 6720}},
    {{7040, 15488}, {3200, 7040}},
    {{6336, 16192}, {2880, 7360}},
    {{5632, 16896}, {2560, 7680}},
    {{4928, 17600}, {2240, 8000}},
    {{4224, 18304}, {1920, 8320}},
    {{3520, 19008}, {1600, 8640}},
    {{2816, 19712}, {1280, 8960}},
    {{2112, 20416}, {960, 9280}},
    {{1408, 21120}, {640, 9600}},
    {{704, 21824}, {320, 9920}},
    {{21504, 0}, {11264, 0}},
    {{20832, 672}, {10912, 352}},
    {{20160, 1344}, {10560, 704}},
    {{19488, 2016}, {10208, 1056}},
    {{18816, 2688}, {9856, 1408}},
    {{18144, 3360}, {9504, 1760}},
    {{17472, 4032}, {9152, 2112}},
    {{16800, 4704}, {8800, 2464}},
    {{16128, 5376}, {8448, 2816}},
    {{15456, 6048}, {8096, 3168}},
    {{14784, 6720}, {7744, 3520}},
    {{14112, 7392}, {7392, 3872}},
    {{13440, 8064}, {7040, 4224}},
    {{12768, 8736}, {6688, 4576}},
    {{12096, 9408}, {6336, 4928}},
    {{11424, 10080}, {5984, 5280}},
    {{10752, 10752}, {5632, 5632}},
    {{10080, 11424}, {5280, 5984}},
    {{9408, 12096}, {4928, 6336}},
    {{8736, 12768}, {4576, 6688}},
    {{8064, 13440}, {4224, 7040}},
    {{7392, 14112}, {3872, 7392}},
    {{6720, 14784}, {3520, 7744}},
    {{6048, 15456}, {3168, 8096}},
    {{5376, 16128}, {2816, 8448}},
    {{4704, 16800}, {2464, 8800}},
    {{4032, 17472}, {2112, 9152}},
    {{3360, 18144}, {1760, 9504}},
    {{2688, 18816}, {1408, 9856}},
    {{2016, 19488}, {1056, 10208}},
    {{1344, 20160}, {704, 10560}},
    {{672, 20832}, {352, 10912}},
    {{20480, 0}, {12288, 0}},
    {{19840, 640}, {11904, 384}},
    {{19200, 1280}, {11520, 768}},
    {{18560, 1920}, {11136, 1152}},
    {{17920, 2560}, {10752, 1536}},
    {{17280, 3200}, {10368, 1920}},
    {{16640, 3840}, {9984, 2304}},
    {{16000, 4480}, {9600, 2688}},
    {{15360, 5120}, {9216, 3072}},
    {{14720, 5760}, {8832, 3456}},
    {{14080, 6400}, {8448, 3840}},
    {{13440, 7040}, {8064, 4224}},
    {{12800, 7680}, {7680, 4608}},
    {{12160, 8320}, {7296, 4992}},
    {{11520, 8960}, {6912, 5376}},
    {{10880, 9600}, {6528, 5760}},
    {{10240, 10240}, {6144, 6144}},
    {{9600, 10880}, {5760, 6528}},
    {{8960, 11520}, {5376, 6912}},
    {{8320, 12160}, {4992, 7296}},
    {{7680, 12800}, {4608, 7680}},
    {{7040, 13440}, {4224, 8064}},
    {{6400, 14080}, {3840, 8448}},
    {{5760, 14720}, {3456, 8832}},
    {{5120, 15360}, {3072, 9216}},
    {{4480, 16000}, {2688, 9600}},
    {{3840, 16640}, {2304, 9984}},
    {{3200, 17280}, {1920, 10368}},
    {{2560, 17920}, {1536, 10752}},
    {{1920, 18560}, {1152, 11136}},
    {{1280, 19200}, {768, 11520}},
    {{640, 19840}, {384, 11904}},
    {{19456, 0}, {13312, 0}},
    {{18848, 608}, {12896, 416}},
    {{18240, 1216}, {12480, 832}},
    {{17632, 1824}, {12064, 1248}},
    {{17024, 2432}, {11648, 1664}},
    {{16416, 3040}, {11232, 2080}},
    {{15808, 3648}, {10816, 2496}},
    {{15200, 4256}, {10400, 2912}},
    {{14592, 4864}, {9984, 3328}},
    {{13984, 5472}, {9568, 3744}},
    {{13376, 6080}, {9152, 4160}},
    {{12768, 6688}, {8736, 4576}},
    {{12160, 7296}, {8320, 4992}},
    {{11552, 7904}, {7904, 5408}},
    {{10944, 8512}, {7488, 5824}},
    {{10336, 9120}, {7072, 6240}},
    {{9728, 9728}, {6656, 6656}},
    {{9120, 10336}, {6240, 7072}},
    {{8512, 10944}, {5824, 7488}},
    {{7904, 11552}, {5408, 7904}},
    {{7296, 12160}, {4992, 8320}},
    {{6688, 12768}, {4576, 8736}},
    {{6080, 13376}, {4160, 9152}},
    {{5472, 13984}, {3744, 9568}},
    {{4864, 14592}, {3328, 9984}},
    {{4256, 15200}, {2912, 10400}},
    {{3648, 15808}, {2496, 10816}},
    {{3040, 16416}, {2080, 11232}},
    {{2432, 17024}, {1664, 11648}},
    {{1824, 17632}, {1248, 12064}},
    {{1216, 18240}, {832, 12480}},
    {{608, 18848}, {416, 12896}},
    {{18432, 0}, {14336, 0}},
    {{17856, 576}, {13888, 448}},
    {{17280, 1152}, {13440, 896}},
    {{16704, 1728}, {12992, 1344}},
    {{16128, 2304}, {12544, 1792}},
    {{15552, 2880}, {12096, 2240}},
    {{14976, 3456}, {11648, 2688}},
    {{14400, 4032}, {11200, 3136}},
    {{13824, 4608}, {10752, 3584}},
    {{13248, 5184}, {10304, 4032}},
    {{12672, 5760}, {9856, 4480}},
    {{12096, 6336}, {9408, 4928}},
    {{11520, 6912}, {8960, 5376}},
    {{10944, 7488}, {8512, 5824}},
    {{10368, 8064}, {8064, 6272}},
    {{9792, 8640}, {7616, 6720}},
    {{9216, 9216}, {7168, 7168}},
    {{8640, 9792}, {6720, 7616}},
    {{8064, 10368}, {6272, 8064}},
    {{7488, 10944}, {5824, 8512}},
    {{6912, 11520}, {5376, 8960}},
    {{6336, 12096}, {4928, 9408}},
    {{5760, 12672}, {4480, 9856}},
    {{5184, 13248}, {4032, 10304}},
    {{4608, 13824}, {3584, 10752}},
    {{4032, 14400}, {3136, 11200}},
    {{3456, 14976}, {2688, 11648}},
    {{2880, 15552}, {2240, 12096}},
    {{2304, 16128}, {1792, 12544}},
    {{1728, 16704}, {1344, 12992}},
    {{1152, 17280}, {896, 13440}},
    {{576, 17856}, {448, 13888}},
    {{17408, 0}, {15360, 0}},
    {{16864, 544}, {14880, 480}},
    {{16320, 1088}, {14400, 960}},
    {{15776, 1632}, {13920, 1440}},
    {{15232, 2176}, {13440, 1920}},
    {{14688, 2720}, {12960, 2400}},
    {{14144, 3264}, {12480, 2880}},
    {{13600, 3808}, {12000, 3360}},
    {{13056, 4352}, {11520, 3840}},
    {{12512, 4896}, {11040, 4320}},
    {{11968, 5440}, {10560, 4800}},
    {{11424, 5984}, {10080, 5280}},
    {{10880, 6528}, {9600, 5760}},
    {{10336, 7072}, {9120, 6240}},
    {{9792, 7616}, {8640, 6720}},
    {{9248, 8160}, {8160, 7200}},
    {{8704, 8704}, {7680, 7680}},
    {{8160, 9248}, {7200, 8160}},
    {{7616, 9792}, {6720, 8640}},
    {{7072, 10336}, {6240, 9120}},
    {{6528, 10880}, {5760, 9600}},
    {{5984, 11424}, {5280, 10080}},
    {{5440, 11968}, {4800, 10560}},
    {{4896, 12512}, {4320, 11040}},
    {{4352, 13056}, {3840, 11520}},
    {{3808, 13600}, {3360, 12000}},
    {{3264, 14144}, {2880, 12480}},
    {{2720, 14688}, {2400, 12960}},
    {{2176, 15232}, {1920, 13440}},
    {{1632, 15776}, {1440, 13920}},
    {{1088, 16320}, {960, 14400}},
    {{544, 16864}, {480, 14880}},
    {{16384, 0}, {16384, 0}},
    {{15872, 512}, {15872, 512}},
    {{15360, 1024}, {15360, 1024}},
    {{14848, 1536}, {14848, 1536}},
    {{14336, 2048}, {14336, 2048}},
    {{13824, 2560}, {13824, 2560}},
    {{13312, 3072}, {13312, 3072}},
    {{12800, 3584}, {12800, 3584}},
    {{12288, 4096}, {12288, 4096}},
    {{11776, 4608}, {11776, 4608}},
    {{11264, 5120}, {11264, 5120}},
    {{10752, 5632}, {10752, 5632}},
    {{10240, 6144}, {10240, 6144}},
    {{9728, 6656}, {9728, 6656}},
    {{9216, 7168}, {9216, 7168}},
    {{8704, 7680}, {8704, 7680}},
    {{8192, 8192}, {8192, 8192}},
    {{7680, 8704}, {7680, 8704}},
    {{7168, 9216}, {7168, 9216}},
    {{6656, 9728}, {6656, 9728}},
    {{6144, 10240}, {6144, 10240}},
    {{5632, 10752}, {5632, 10752}},
    {{5120, 11264}, {5120, 11264}},
    {{4608, 11776}, {4608, 11776}},
    {{4096, 12288}, {4096, 12288}},
    {{3584, 12800}, {3584, 12800}},
    {{3072, 13312}, {3072, 13312}},
    {{2560, 13824}, {2560, 13824}},
    {{2048, 14336}, {2048, 14336}},
    {{1536, 14848}, {1536, 14848}},
    {{1024, 15360}, {1024, 15360}},
    {{512, 15872}, {512, 15872}},
    {{15360, 0}, {17408, 0}},
    {{14880, 480}, {16864, 544}},
    {{14400, 960}, {16320, 1088}},
    {{13920, 1440}, {15776, 1632}},
    {{13440, 1920}, {15232, 2176}},
    {{12960, 2400}, {14688, 2720}},
    {{12480, 2880}, {14144, 3264}},
    {{12000, 3360}, {13600, 3808}},
    {{11520, 3840}, {13056, 4352}},
    {{11040, 4320}, {12512, 4896}},
    {{10560, 4800}, {11968, 5440}},
    {{10080, 5280}, {11424, 5984}},
    {{9600, 5760}, {10880, 6528}},
    {{9120, 6240}, {10336, 7072}},
    {{8640, 6720}, {9792, 7616}},
    {{8160, 7200}, {9248, 8160}},
    {{7680, 7680}, {8704, 8704}},
    {{7200, 8160}, {8160, 9248}},
    {{6720, 8640}, {7616, 9792}},
    {{6240, 9120}, {7072, 10336}},
    {{5760, 9600}, {6528, 10880}},
    {{5280, 10080}, {5984, 11424}},
    {{4800, 10560}, {5440, 11968}},
    {{4320, 11040}, {4896, 12512}},
    {{3840, 11520}, {4352, 13056}},
    {{3360, 12000}, {3808, 13600}},
    {{2880, 12480}, {3264, 14144}},
    {{2400, 12960}, {2720, 14688}},
    {{1920, 13440}, {2176, 15232}},
    {{1440, 13920}, {1632, 15776}},
    {{960, 14400}, {1088, 16320}},
    {{480, 14880}, {544, 16864}},
    {{14336, 0}, {18432, 0}},
    {{13888, 448}, {17856, 576}},
    {{13440, 896}, {17280, 1152}},
    {{12992, 1344}, {16704, 1728}},
    {{12544, 1792}, {16128, 2304}},
    {{12096, 2240}, {15552, 2880}},
    {{11648, 2688}, {14976, 3456}},
    {{11200, 3136}, {14400, 4032}},
    {{10752, 3584}, {13824, 4608}},
    {{10304, 4032}, {13248, 5184}},
    {{9856, 4480}, {12672, 5760}},
    {{9408, 4928}, {12096, 6336}},
    {{8960, 5376}, {11520, 6912}},
    {{8512, 5824}, {10944, 7488}},
    {{8064, 6272}, {10368, 8064}},
    {{7616, 6720}, {9792, 8640}},
    {{7168, 7168}, {9216, 9216}},
    {{6720, 7616}, {8640, 9792}},
    {{6272, 8064}, {8064, 10368}},
    {{5824, 8512}, {7488, 10944}},
    {{5376, 8960}, {6912, 11520}},
    {{4928, 9408}, {6336, 12096}},
    {{4480, 9856}, {5760, 12672}},
    {{4032, 10304}, {5184, 13248}},
    {{3584, 10752}, {4608, 13824}},
    {{3136, 11200}, {4032, 14400}},
    {{2688, 11648}, {3456, 14976}},
    {{2240, 12096}, {2880, 15552}},
    {{1792, 12544}, {2304, 16128}},
    {{1344, 12992}, {1728, 16704}},
    {{896, 13440}, {1152, 17280}},
    {{448, 13888}, {576, 17856}},
    {{13312, 0}, {19456, 0}},
    {{12896, 416}, {18848, 608}},
    {{12480, 832}, {18240, 1216}},
    {{12064, 1248}, {17632, 1824}},
    {{11648, 1664}, {17024, 2432}},
    {{11232, 2080}, {16416, 3040}},
    {{10816, 2496}, {15808, 3648}},
    {{10400, 2912}, {15200, 4256}},
    {{9984, 3328}, {14592, 4864}},
    {{9568, 3744}, {13984, 5472}},
    {{9152, 4160}, {13376, 6080}},
    {{8736, 4576}, {12768, 6688}},
    {{8320, 4992}, {12160, 7296}},
    {{7904, 5408}, {11552, 7904}},
    {{7488, 5824}, {10944, 8512}},
    {{7072, 6240}, {10336, 9120}},
    {{6656, 6656}, {9728, 9728}},
    {{6240, 7072}, {9120, 10336}},
    {{5824, 7488}, {8512, 10944}},
    {{5408, 7904}, {7904, 11552}},
    {{4992, 8320}, {7296, 12160}},
    {{4576, 8736}, {6688, 12768}},
    {{4160, 9152}, {6080, 13376}},
    {{3744, 9568}, {5472, 13984}},
    {{3328, 9984}, {4864, 14592}},
    {{2912, 10400}, {4256, 15200}},
    {{2496, 10816}, {3648, 15808}},
    {{2080, 11232}, {3040, 16416}},
    {{1664, 11648}, {2432, 17024}},
    {{1248, 12064}, {1824, 17632}},
    {{832, 12480}, {1216, 18240}},
    {{416, 12896}, {608, 18848}},
    {{12288, 0}, {20480, 0}},
    {{11904, 384}, {19840, 640}},
    {{11520, 768}, {19200, 1280}},
    {{11136, 1152}, {18560, 1920}},
    {{10752, 1536}, {17920, 2560}},
    {{10368, 1920}, {17280, 3200}},
    {{9984, 2304}, {16640, 3840}},
    {{9600, 2688}, {16000, 4480}},
    {{9216, 3072}, {15360, 5120}},
    {{8832, 3456}, {14720, 5760}},
    {{8448, 3840}, {14080, 6400}},
    {{8064, 4224}, {13440, 7040}},
    {{7680, 4608}, {12800, 7680}},
    {{7296, 4992}, {12160, 8320}},
    {{6912, 5376}, {11520, 8960}},
    {{6528, 5760}, {10880, 9600}},
    {{6144, 6144}, {10240, 10240}},
    {{5760, 6528}, {9600, 10880}},
    {{5376, 6912}, {8960, 11520}},
    {{4992, 7296}, {8320, 12160}},
    {{4608, 7680}, {7680, 12800}},
    {{4224, 8064}, {7040, 13440}},
    {{3840, 8448}, {6400, 14080}},
    {{3456, 8832}, {5760, 14720}},
    {{3072, 9216}, {5120, 15360}},
    {{2688, 9600}, {4480, 16000}},
    {{2304, 9984}, {3840, 16640}},
    {{1920, 10368}, {3200, 17280}},
    {{1536, 10752}, {2560, 17920}},
    {{1152, 11136}, {1920, 18560}},
    {{768, 11520}, {1280, 19200}},
    {{384, 11904}, {640, 19840}},
    {{11264, 0}, {21504, 0}},
    {{10912, 352}, {20832, 672}},
    {{10560, 704}, {20160, 1344}},
    {{10208, 1056}, {19488, 2016}},
    {{9856, 1408}, {18816, 2688}},
    {{9504, 1760}, {18144, 3360}},
    {{9152, 2112}, {17472, 4032}},
    {{8800, 2464}, {16800, 4704}},
    {{8448, 2816}, {16128, 5376}},
    {{8096, 3168}, {15456, 6048}},
    {{7744, 3520}, {14784, 6720}},
    {{7392, 3872}, {14112, 7392}},
    {{7040, 4224}, {13440, 8064}},
    {{6688, 4576}, {12768, 8736}},
    {{6336, 4928}, {12096, 9408}},
    {{5984, 5280}, {11424, 10080}},
    {{5632, 5632}, {10752, 10752}},
    {{5280, 5984}, {10080, 11424}},
    {{4928, 6336}, {9408, 12096}},
    {{4576, 6688}, {8736, 12768}},
    {{4224, 7040}, {8064, 13440}},
    {{3872, 7392}, {7392, 14112}},
    {{3520, 7744}, {6720, 14784}},
    {{3168, 8096}, {6048, 15456}},
    {{2816, 8448}, {5376, 16128}},
    {{2464, 8800}, {4704, 16800}},
    {{2112, 9152}, {4032, 17472}},
    {{1760, 9504}, {3360, 18144}},
    {{1408, 9856}, {2688, 18816}},
    {{1056, 10208}, {2016, 19488}},
    {{704, 10560}, {1344, 20160}},
    {{352, 10912}, {672, 20832}},
    {{10240, 0}, {22528, 0}},
    {{9920, 320}, {21824, 704}},
    {{9600, 640}, {21120, 1408}},
    {{9280, 960}, {20416, 2112}},
    {{8960, 1280}, {19712, 2816}},
    {{8640, 1600}, {19008, 3520}},
    {{8320, 1920}, {18304, 4224}},
    {{8000, 2240}, {17600, 4928}},
    {{7680, 2560}, {16896, 5632}},
    {{7360, 2880}, {16192, 6336}},
    {{7040, 3200}, {15488, 7040}},
    {{6720, 3520}, {14784, 7744}},
    {{6400, 3840}, {14080, 8448}},
    {{6080, 4160}, {13376, 9152}},
    {{5760, 4480}, {12672, 9856}},
    {{5440, 4800}, {11968, 10560}},
    {{5120, 5120}, {11264, 11264}},
    {{4800, 5440}, {10560, 11968}},
    {{4480, 5760}, {9856, 12672}},
    {{4160, 6080}, {9152, 13376}},
    {{3840, 6400}, {8448, 14080}},
    {{3520, 6720}, {7744, 14784}},
    {{3200, 7040}, {7040, 15488}},
    {{2880, 7360}, {6336, 16192}},
    {{2560, 7680}, {5632, 16896}},
    {{2240, 8000}, {4928, 17600}},
    {{1920, 8320}, {4224, 18304}},
    {{1600, 8640}, {3520, 19008}},
    {{1280, 8960}, {2816, 19712}},
    {{960, 9280}, {2112, 20416}},
    {{640, 9600}, {1408, 21120}},
    {{320, 9920}, {704, 21824}},
    {{9216, 0}, {23552, 0}},
    {{8928, 288}, {22816, 736}},
    {{8640, 576}, {22080, 1472}},
    {{8352, 864}, {21344, 2208}},
    {{8064, 1152}, {20608, 2944}},
    {{7776, 1440}, {19872, 3680}},
    {{7488, 1728}, {19136, 4416}},
    {{7200, 2016}, {18400, 5152}},
    {{6912, 2304}, {17664, 5888}},
    {{6624, 2592}, {16928, 6624}},
    {{6336, 2880}, {16192, 7360}},
    {{6048, 3168}, {15456, 8096}},
    {{5760, 3456}, {14720, 8832}},
    {{5472, 3744}, {13984, 9568}},
    {{5184, 4032}, {13248, 10304}},
    {{4896, 4320}, {12512, 11040}},
    {{4608, 4608}, {11776, 11776}},
    {{4320, 4896}, {11040, 12512}},
    {{4032, 5184}, {10304, 13248}},
    {{3744, 5472}, {9568, 13984}},
    {{3456, 5760}, {8832, 14720}},
    {{3168, 6048}, {8096, 15456}},
    {{2880, 6336}, {7360, 16192}},
    {{2592, 6624}, {6624, 16928}},
    {{2304, 6912}, {5888, 17664}},
    {{2016, 7200}, {5152, 18400}},
    {{1728, 7488}, {4416, 19136}},
    {{1440, 7776}, {3680, 19872}},
    {{1152, 8064}, {2944, 20608}},
    {{864, 8352}, {2208, 21344}},
    {{576, 8640}, {1472, 22080}},
    {{288, 8928}, {736, 22816}},
    {{8192, 0}, {24576, 0}},
    {{7936, 256}, {23808, 768}},
    {{7680, 512}, {23040, 1536}},
    {{7424, 768}, {22272, 2304}},
    {{7168, 1024}, {21504, 3072}},
    {{6912, 1280}, {20736, 3840}},
    {{6656, 1536}, {19968, 4608}},
    {{6400, 1792}, {19200, 5376}},
    {{6144, 2048}, {18432, 6144}},
    {{5888, 2304}, {17664, 6912}},
    {{5632, 2560}, {16896, 7680}},
    {{5376, 2816}, {16128, 8448}},
    {{5120, 3072}, {15360, 9216}},
    {{4864, 3328}, {14592, 9984}},
    {{4608, 3584}, {13824, 10752}},
    {{4352, 3840}, {13056, 11520}},
    {{4096, 4096}, {12288, 12288}},
    {{3840, 4352}, {11520, 13056}},
    {{3584, 4608}, {10752, 13824}},
    {{3328, 4864}, {9984, 14592}},
    {{3072, 5120}, {9216, 15360}},
    {{2816, 5376}, {8448, 16128}},
    {{2560, 5632}, {7680, 16896}},
    {{2304, 5888}, {6912, 17664}},
    {{2048, 6144}, {6144, 18432}},
    {{1792, 6400}, {5376, 19200}},
    {{1536, 6656}, {4608, 19968}},
    {{1280, 6912}, {3840, 20736}},
    {{1024, 7168}, {3072, 21504}},
    {{768, 7424}, {2304, 22272}},
    {{512, 7680}, {1536, 23040}},
    {{256, 7936}, {768, 23808}},
    {{7168, 0}, {25600, 0}},
    {{6944, 224}, {24800, 800}},
    {{6720, 448}, {24000, 1600}},
    {{6496, 672}, {23200, 2400}},
    {{6272, 896}, {22400, 3200}},
    {{6048, 1120}, {21600, 4000}},
    {{5824, 1344}, {20800, 4800}},
    {{5600, 1568}, {20000, 5600}},
    {{5376, 1792}, {19200, 6400}},
    {{5152, 2016}, {18400, 7200}},
    {{4928, 2240}, {17600, 8000}},
    {{4704, 2464}, {16800, 8800}},
    {{4480, 2688}, {16000, 9600}},
    {{4256, 2912}, {15200, 10400}},
    {{4032, 3136}, {14400, 11200}},
    {{3808, 3360}, {13600, 12000}},
    {{3584, 3584}, {12800, 12800}},
    {{3360, 3808}, {12000, 13600}},
    {{3136, 4032}, {11200, 14400}},
    {{2912, 4256}, {10400, 15200}},
    {{2688, 4480}, {9600, 16000}},
    {{2464, 4704}, {8800, 16800}},
    {{2240, 4928}, {8000, 17600}},
    {{2016, 5152}, {7200, 18400}},
    {{1792, 5376}, {6400, 19200}},
    {{1568, 5600}, {5600, 20000}},
    {{1344, 5824}, {4800, 20800}},
    {{1120, 6048}, {4000, 21600}},
    {{896, 6272}, {3200, 22400}},
    {{672, 6496}, {2400, 23200}},
    {{448, 6720}, {1600, 24000}},
    {{224, 6944}, {800, 24800}},
    {{6144, 0}, {26624, 0}},
    {{5952, 192}, {25792, 832}},
    {{5760, 384}, {24960, 1664}},
    {{5568, 576}, {24128, 2496}},
    {{5376, 768}, {23296, 3328}},
    {{5184, 960}, {22464, 4160}},
    {{4992, 1152}, {21632, 4992}},
    {{4800, 1344}, {20800, 5824}},
    {{4608, 1536}, {19968, 6656}},
    {{4416, 1728}, {19136, 7488}},
    {{4224, 1920}, {18304, 8320}},
    {{4032, 2112}, {17472, 9152}},
    {{3840, 2304}, {16640, 9984}},
    {{3648, 2496}, {15808, 10816}},
    {{3456, 2688}, {14976, 11648}},
    {{3264, 2880}, {14144, 12480}},
    {{3072, 3072}, {13312, 13312}},
    {{2880, 3264}, {12480, 14144}},
    {{2688, 3456}, {11648, 14976}},
    {{2496, 3648}, {10816, 15808}},
    {{2304, 3840}, {9984, 16640}},
    {{2112, 4032}, {9152, 17472}},
    {{1920, 4224}, {8320, 18304}},
    {{1728, 4416}, {7488, 19136}},
    {{1536, 4608}, {6656, 19968}},
    {{1344, 4800}, {5824, 20800}},
    {{1152, 4992}, {4992, 21632}},
    {{960, 5184}, {4160, 22464}},
    {{768, 5376}, {3328, 23296}},
    {{576, 5568}, {2496, 24128}},
    {{384, 5760}, {1664, 24960}},
    {{192, 5952}, {832, 25792}},
    {{5120, 0}, {27648, 0}},
    {{4960, 160}, {26784, 864}},
    {{4800, 320}, {25920, 1728}},
    {{4640, 480}, {25056, 2592}},
    {{4480, 640}, {24192, 3456}},
    {{4320, 800}, {23328, 4320}},
    {{4160, 960}, {22464, 5184}},
    {{4000, 1120}, {21600, 6048}},
    {{3840, 1280}, {20736, 6912}},
    {{3680, 1440}, {19872, 7776}},
    {{3520, 1600}, {19008, 8640}},
    {{3360, 1760}, {18144, 9504}},
    {{3200, 1920}, {17280, 10368}},
    {{3040, 2080}, {16416, 11232}},
    {{2880, 2240}, {15552, 12096}},
    {{2720, 2400}, {14688, 12960}},
    {{2560, 2560}, {13824, 13824}},
    {{2400, 2720}, {12960, 14688}},
    {{2240, 2880}, {12096, 15552}},
    {{2080, 3040}, {11232, 16416}},
    {{1920, 3200}, {10368, 17280}},
    {{1760, 3360}, {9504, 18144}},
    {{1600, 3520}, {8640, 19008}},
    {{1440, 3680}, {7776, 19872}},
    {{1280, 3840}, {6912, 20736}},
    {{1120, 4000}, {6048, 21600}},
    {{960, 4160}, {5184, 22464}},
    {{800, 4320}, {4320, 23328}},
    {{640, 4480}, {3456, 24192}},
    {{480, 4640}, {2592, 25056}},
    {{320, 4800}, {1728, 25920}},
    {{160, 4960}, {864, 26784}},
    {{4096, 0}, {28672, 0}},
    {{3968, 128}, {27776, 896}},
    {{3840, 256}, {26880, 1792}},
    {{3712, 384}, {25984, 2688}},
    {{3584, 512}, {25088, 3584}},
    {{3456, 640}, {24192, 4480}},
    {{3328, 768}, {23296, 5376}},
    {{3200, 896}, {22400, 6272}},
    {{3072, 1024}, {21504, 7168}},
    {{2944, 1152}, {20608, 8064}},
    {{2816, 1280}, {19712, 8960}},
    {{2688, 1408}, {18816, 9856}},
    {{2560, 1536}, {17920, 10752}},
    {{2432, 1664}, {17024, 11648}},
    {{2304, 1792}, {16128, 12544}},
    {{2176, 1920}, {15232, 13440}},
    {{2048, 2048}, {14336, 14336}},
    {{1920, 2176}, {13440, 15232}},
    {{1792, 2304}, {12544, 16128}},
    {{1664, 2432}, {11648, 17024}},
    {{1536, 2560}, {10752, 17920}},
    {{1408, 2688}, {9856, 18816}},
    {{1280, 2816}, {8960, 19712}},
    {{1152, 2944}, {8064, 20608}},
    {{1024, 3072}, {7168, 21504}},
    {{896, 3200}, {6272, 22400}},
    {{768, 3328}, {5376, 23296}},
    {{640, 3456}, {4480, 24192}},
    {{512, 3584}, {3584, 25088}},
    {{384, 3712}, {2688, 25984}},
    {{256, 3840}, {1792, 26880}},
    {{128, 3968}, {896, 27776}},
    {{3072, 0}, {29696, 0}},
    {{2976, 96}, {28768, 928}},
    {{2880, 192}, {27840, 1856}},
    {{2784, 288}, {26912, 2784}},
    {{2688, 384}, {25984, 3712}},
    {{2592, 480}, {25056, 4640}},
    {{2496, 576}, {24128, 5568}},
    {{2400, 672}, {23200, 6496}},
    {{2304, 768}, {22272, 7424}},
    {{2208, 864}, {21344, 8352}},
    {{2112, 960}, {20416, 9280}},
    {{2016, 1056}, {19488, 10208}},
    {{1920, 1152}, {18560, 11136}},
    {{1824, 1248}, {17632, 12064}},
    {{1728, 1344}, {16704, 12992}},
    {{1632, 1440}, {15776, 13920}},
    {{1536, 1536}, {14848, 14848}},
    {{1440, 1632}, {13920, 15776}},
    {{1344, 1728}, {12992, 16704}},
    {{1248, 1824}, {12064, 17632}},
    {{1152, 1920}, {11136, 18560}},
    {{1056, 2016}, {10208, 19488}},
    {{960, 2112}, {9280, 20416}},
    {{864, 2208}, {8352, 21344}},
    {{768, 2304}, {7424, 22272}},
    {{672, 2400}, {6496, 23200}},
    {{576, 2496}, {5568, 24128}},
    {{480, 2592}, {4640, 25056}},
    {{384, 2688}, {3712, 25984}},
    {{288, 2784}, {2784, 26912}},
    {{192, 2880}, {1856, 27840}},
    {{96, 2976}, {928, 28768}},
    {{2048, 0}, {30720, 0}},
    {{1984, 64}, {29760, 960}},
    {{1920, 128}, {28800, 1920}},
    {{1856, 192}, {27840, 2880}},
    {{1792, 256}, {26880, 3840}},
    {{1728, 320}, {25920, 4800}},
    {{1664, 384}, {24960, 5760}},
    {{1600, 448}, {24000, 6720}},
    {{1536, 512}, {23040, 7680}},
    {{1472, 576}, {22080, 8640}},
    {{1408, 640}, {21120, 9600}},
    {{1344, 704}, {20160, 10560}},
    {{1280, 768}, {19200, 11520}},
    {{1216, 832}, {18240, 12480}},
    {{1152, 896}, {17280, 13440}},
    {{1088, 960}, {16320, 14400}},
    {{1024, 1024}, {15360, 15360}},
    {{960, 1088}, {14400, 16320}},
    {{896, 1152}, {13440, 17280}},
    {{832, 1216}, {12480, 18240}},
    {{768, 1280}, {11520, 19200}},
    {{704, 1344}, {10560, 20160}},
    {{640, 1408}, {9600, 21120}},
    {{576, 1472}, {8640, 22080}},
    {{512, 1536}, {7680, 23040}},
    {{448, 1600}, {6720, 24000}},
    {{384, 1664}, {5760, 24960}},
    {{320, 1728}, {4800, 25920}},
    {{256, 1792}, {3840, 26880}},
    {{192, 1856}, {2880, 27840}},
    {{128, 1920}, {1920, 28800}},
    {{64, 1984}, {960, 29760}},
    {{1024, 0}, {31744, 0}},
    {{992, 32}, {30752, 992}},
    {{960, 64}, {29760, 1984}},
    {{928, 96}, {28768, 2976}},
    {{896, 128}, {27776, 3968}},
    {{864, 160}, {26784, 4960}},
    {{832, 192}, {25792, 5952}},
    {{800, 224}, {24800, 6944}},
    {{768, 256}, {23808, 7936}},
    {{736, 288}, {22816, 8928}},
    {{704, 320}, {21824, 9920}},
    {{672, 352}, {20832, 10912}},
    {{640, 384}, {19840, 11904}},
    {{608, 416}, {18848, 12896}},
    {{576, 448}, {17856, 13888}},
    {{544, 480}, {16864, 14880}},
    {{512, 512}, {15872, 15872}},
    {{480, 544}, {14880, 16864}},
    {{448, 576}, {13888, 17856}},
    {{416, 608}, {12896, 18848}},
    {{384, 640}, {11904, 19840}},
    {{352, 672}, {10912, 20832}},
    {{320, 704}, {9920, 21824}},
    {{288, 736}, {8928, 22816}},
    {{256, 768}, {7936, 23808}},
    {{224, 800}, {6944, 24800}},
    {{192, 832}, {5952, 25792}},
    {{160, 864}, {4960, 26784}},
    {{128, 896}, {3968, 27776}},
    {{96, 928}, {2976, 28768}},
    {{64, 960}, {1984, 29760}},
    {{32, 992}, {992, 30752}}};

// Bounds `value` from below with `min_value`, then from above with
// `max_value`, using the project's HPC::utils::max / HPC::utils::min helpers.
// Presumably this clamps `value` into [min_value, max_value]; that relies on
// the helpers having ordinary min/max semantics, which is not visible here.
template <typename T>
inline T clip(T value, T min_value, T max_value)
{
    T lower_bounded = HPC::utils::max(value, min_value);
    return HPC::utils::min(lower_bounded, max_value);
}

// Writes the two linear-interpolation weights for offset `x` into `coeffs`:
// coeffs[0] = 1 - x and coeffs[1] = x. `coeffs` must have room for two floats.
static inline void interpolateLinear(float x, float *coeffs)
{
    const float far_weight = x;
    const float near_weight = 1.f - far_weight;
    coeffs[0] = near_weight;
    coeffs[1] = far_weight;
}

template <typename T, int32_t cn>
void warpAffine_nearest(
    T *dst,
    const T *src,
    int32_t inHeight,
    int32_t inWidth,
    int32_t inWidthStride,
    int32_t outHeight,
    int32_t outWidth,
    int32_t outWidthStride,
    const float *M,
    int32_t borderMode,
    float borderValue = 0.0f)
{
    const int32_t BLOCK_SIZE = 32;
    int32_t _map[BLOCK_SIZE * BLOCK_SIZE + 32];
    int32_t *map = alignPtr(_map, 16);

    int32_t round_delta = AB_SCALE >> 1;

    float32x4_t v_m1 = vdupq_n_f32(M[1]);
    float32x4_t v_m2 = vdupq_n_f32(M[2]);
    float32x4_t v_m4 = vdupq_n_f32(M[4]);
    float32x4_t v_m5 = vdupq_n_f32(M[5]);

    int32_t *adelta = (int32_t *)malloc(outWidth * sizeof(int32_t));
    int32_t *bdelta = (int32_t *)malloc(outWidth * sizeof(int32_t));
    for (int32_t x = 0; x < outWidth; ++x) {
        adelta[x] = rint(M[0] * x * AB_SCALE);
        bdelta[x] = rint(M[3] * x * AB_SCALE);
    }

    if (borderMode == BORDER_REPLICATE) {
        int32x4_t v_zero4 = vdupq_n_s32(0);
        int32x4_t max_width = vdupq_n_s32(inWidth - 1);
        int32x4_t max_height = vdupq_n_s32(inHeight - 1);
        int32x4_t v_cn = vdupq_n_s32(cn);
        int32x4_t v_inWidthStride = vdupq_n_s32(inWidthStride);
        int32x4_t v_round_delta = vdupq_n_s32(round_delta);
        float32x4_t v_AB_SCALE = vdupq_n_f32(AB_SCALE);
        for (int32_t i = 0; i < outHeight; i += BLOCK_SIZE) {
            size_t blockHeight = std::min<size_t>(BLOCK_SIZE, outHeight - i);
            for (int32_t j = 0; j < outWidth; j += BLOCK_SIZE) {
                size_t blockWidth = std::min<size_t>(BLOCK_SIZE, outWidth - j);

                // compute table
                for (size_t y = 0; y < blockHeight; ++y) {
                    int32_t *map_row = getRowPtr(&map[0], blockWidth, y);
                    size_t x = 0, dsty = y + i;
                    float32x4_t v_y = vdupq_n_f32(dsty);
                    float32x4_t v_yx = vmlaq_f32(v_m2, v_m1, v_y), v_yy = vmlaq_f32(v_m5, v_m4, v_y);

                    for (; x + 4 <= blockWidth; x += 4) {
                        int32_t dstx = x + j;
                        int32x4_t X0 = vaddq_s32(vcvtq_s32_f32(vmulq_f32(v_yx, v_AB_SCALE)), v_round_delta);
                        int32x4_t Y0 = vaddq_s32(vcvtq_s32_f32(vmulq_f32(v_yy, v_AB_SCALE)), v_round_delta);
                        int32x4_t srcX = vshrq_n_s32(vaddq_s32(X0, vld1q_s32(adelta + dstx)), AB_BITS);
                        int32x4_t srcY = vshrq_n_s32(vaddq_s32(Y0, vld1q_s32(bdelta + dstx)), AB_BITS);
                        srcX = vminq_s32(vmaxq_s32(srcX, v_zero4), max_width);
                        srcY = vminq_s32(vmaxq_s32(srcY, v_zero4), max_height);
                        int32x4_t v_src_index = vmlaq_s32(vmulq_s32(srcY, v_inWidthStride), srcX, v_cn);
                        vst1q_s32(map_row + x, v_src_index);
                    }

                    for (; x < blockWidth; ++x) {
                        int32_t dstx = x + j;
                        int32_t X0 = rint((M[1] * dsty + M[2]) * AB_SCALE) + round_delta;
                        int32_t Y0 = rint((M[4] * dsty + M[5]) * AB_SCALE) + round_delta;
                        int32_t srcX = (X0 + adelta[dstx]) >> AB_BITS;
                        int32_t srcY = (Y0 + bdelta[dstx]) >> AB_BITS;
                        srcX = clip(srcX, 0, inWidth - 1);
                        srcY = clip(srcY, 0, inHeight - 1);
                        map_row[x] = srcY * inWidthStride + srcX * cn;
                    }
                }
                for (size_t y = 0; y < blockHeight; ++y) {
                    const int32_t *map_row = getRowPtr(map, blockWidth, y);
                    T *dst_row = getRowPtr(dst, outWidthStride, i + y) + j * cn;

                    for (size_t x = 0; x < blockWidth; x++) {
                        int32_t tmp = x * cn;
                        for (int32_t k = 0; k < cn; ++k) {
                            dst_row[tmp + k] = src[map_row[x] + k];
                        }
                    }
                }
            }
        }
    } else if (borderMode == BORDER_CONSTANT) {
        int32x4_t v_nega = vdupq_n_s32(-1);
        int32x4_t max_width = vdupq_n_s32(inWidth - 1);
        int32x4_t max_height = vdupq_n_s32(inHeight - 1);
        int32x4_t v_cn = vdupq_n_s32(cn);
        int32x4_t v_inWidthStride = vdupq_n_s32(inWidthStride);
        int32x4_t v_round_delta = vdupq_n_s32(round_delta);
        float32x4_t v_AB_SCALE = vdupq_n_f32(AB_SCALE);
        for (int32_t i = 0; i < outHeight; i += BLOCK_SIZE) {
            size_t blockHeight = std::min<size_t>(BLOCK_SIZE, outHeight - i);
            for (int32_t j = 0; j < outWidth; j += BLOCK_SIZE) {
                size_t blockWidth = std::min<size_t>(BLOCK_SIZE, outWidth - j);

                // compute table
                for (size_t y = 0; y < blockHeight; ++y) {
                    int32_t *map_row = getRowPtr(&map[0], blockWidth, y);
                    size_t x = 0, dsty = y + i;
                    float32x4_t v_y = vdupq_n_f32(dsty);
                    float32x4_t v_yx = vmlaq_f32(v_m2, v_m1, v_y), v_yy = vmlaq_f32(v_m5, v_m4, v_y);

                    for (; x + 4 <= blockWidth; x += 4) {
                        int32_t dstx = x + j;
                        int32x4_t X0 = vaddq_s32(vcvtq_s32_f32(vmulq_f32(v_yx, v_AB_SCALE)), v_round_delta);
                        int32x4_t Y0 = vaddq_s32(vcvtq_s32_f32(vmulq_f32(v_yy, v_AB_SCALE)), v_round_delta);
                        int32x4_t srcX = vshrq_n_s32(vaddq_s32(X0, vld1q_s32(adelta + dstx)), AB_BITS);
                        int32x4_t srcY = vshrq_n_s32(vaddq_s32(Y0, vld1q_s32(bdelta + dstx)), AB_BITS);
                        uint32x4_t flg0 = vcleq_u32(vreinterpretq_u32_s32(srcX), vreinterpretq_u32_s32(max_width));
                        uint32x4_t flg1 = vcleq_u32(vreinterpretq_u32_s32(srcY), vreinterpretq_u32_s32(max_height));
                        flg0 = vandq_u32(flg0, flg1);
                        int32x4_t v_src_index = vmlaq_s32(vmulq_s32(srcY, v_inWidthStride), srcX, v_cn);
                        v_src_index = vbslq_s32(flg0, v_src_index, v_nega);
                        vst1q_s32(map_row + x, v_src_index);
                    }

                    for (; x < blockWidth; ++x) {
                        int32_t dstx = x + j;
                        int32_t X0 = rint((M[1] * dsty + M[2]) * AB_SCALE) + round_delta;
                        int32_t Y0 = rint((M[4] * dsty + M[5]) * AB_SCALE) + round_delta;
                        int32_t srcX = (X0 + adelta[dstx]) >> AB_BITS;
                        int32_t srcY = (Y0 + bdelta[dstx]) >> AB_BITS;
                        if ((unsigned)(srcX - 0) <= (unsigned)(inWidth - 1 - 0) && (unsigned)(srcY - 0) <= (unsigned)(inHeight - 1 - 0))
                            map_row[x] = srcY * inWidthStride + srcX * cn;
                        else
                            map_row[x] = -1;
                    }
                }
                for (size_t y = 0; y < blockHeight; ++y) {
                    const int32_t *map_row = getRowPtr(map, blockWidth, y);
                    T *dst_row = getRowPtr(dst, outWidthStride, i + y) + j * cn;

                    for (size_t x = 0; x < blockWidth; x++) {
                        int32_t tmp = x * cn;
                        for (int32_t k = 0; k < cn; ++k) {
                            dst_row[tmp + k] = map_row[x] >= 0 ? src[map_row[x] + k] : borderValue;
                        }
                    }
                }
            }
        }
    } else if (borderMode == BORDER_TRANSPARENT) {
        int32x4_t v_nega = vdupq_n_s32(-1);
        int32x4_t max_width = vdupq_n_s32(inWidth - 1);
        int32x4_t max_height = vdupq_n_s32(inHeight - 1);
        int32x4_t v_cn = vdupq_n_s32(cn);
        int32x4_t v_inWidthStride = vdupq_n_s32(inWidthStride);
        int32x4_t v_round_delta = vdupq_n_s32(round_delta);
        float32x4_t v_AB_SCALE = vdupq_n_f32(AB_SCALE);
        for (int32_t i = 0; i < outHeight; i += BLOCK_SIZE) {
            size_t blockHeight = std::min<size_t>(BLOCK_SIZE, outHeight - i);
            for (int32_t j = 0; j < outWidth; j += BLOCK_SIZE) {
                size_t blockWidth = std::min<size_t>(BLOCK_SIZE, outWidth - j);

                // compute table
                for (size_t y = 0; y < blockHeight; ++y) {
                    int32_t *map_row = getRowPtr(&map[0], blockWidth, y);
                    size_t x = 0, dsty = y + i;
                    float32x4_t v_y = vdupq_n_f32(dsty);
                    float32x4_t v_yx = vmlaq_f32(v_m2, v_m1, v_y), v_yy = vmlaq_f32(v_m5, v_m4, v_y);
                    for (; x + 4 <= blockWidth; x += 4) {
                        int32_t dstx = x + j;
                        int32x4_t X0 = vaddq_s32(vcvtq_s32_f32(vmulq_f32(v_yx, v_AB_SCALE)), v_round_delta);
                        int32x4_t Y0 = vaddq_s32(vcvtq_s32_f32(vmulq_f32(v_yy, v_AB_SCALE)), v_round_delta);
                        int32x4_t srcX = vshrq_n_s32(vaddq_s32(X0, vld1q_s32(adelta + dstx)), AB_BITS);
                        int32x4_t srcY = vshrq_n_s32(vaddq_s32(Y0, vld1q_s32(bdelta + dstx)), AB_BITS);
                        uint32x4_t flg0 = vcleq_u32(vreinterpretq_u32_s32(srcX), vreinterpretq_u32_s32(max_width));
                        uint32x4_t flg1 = vcleq_u32(vreinterpretq_u32_s32(srcY), vreinterpretq_u32_s32(max_height));
                        flg0 = vandq_u32(flg0, flg1);
                        int32x4_t v_src_index = vmlaq_s32(vmulq_s32(srcY, v_inWidthStride), srcX, v_cn);
                        v_src_index = vbslq_s32(flg0, v_src_index, v_nega);
                        vst1q_s32(map_row + x, v_src_index);
                    }

                    for (; x < blockWidth; ++x) {
                        int32_t dstx = x + j;
                        int32_t X0 = rint((M[1] * dsty + M[2]) * AB_SCALE) + round_delta;
                        int32_t Y0 = rint((M[4] * dsty + M[5]) * AB_SCALE) + round_delta;
                        int32_t srcX = (X0 + adelta[dstx]) >> AB_BITS;
                        int32_t srcY = (Y0 + bdelta[dstx]) >> AB_BITS;
                        if ((unsigned)(srcX - 0) <= (unsigned)(inWidth - 1 - 0) && (unsigned)(srcY - 0) <= (unsigned)(inHeight - 1 - 0))
                            map_row[x] = srcY * inWidthStride + srcX * cn;
                        else
                            map_row[x] = -1;
                    }
                }
                for (size_t y = 0; y < blockHeight; ++y) {
                    const int32_t *map_row = getRowPtr(map, blockWidth, y);
                    T *dst_row = getRowPtr(dst, outWidthStride, i + y) + j * cn;

                    for (size_t x = 0; x < blockWidth; x++) {
                        if (map_row[x] < 0)
                            continue;
                        int32_t tmp = x * cn;
                        for (int32_t k = 0; k < cn; ++k) {
                            dst_row[tmp + k] = src[map_row[x] + k];
                        }
                    }
                }
            }
        }
    }
}

/**
 * Bilinear affine warp of an interleaved image with `nc` channels.
 *
 * For each destination pixel (j, i) the source position is
 *     x = M[0] * j + M[1] * i + M[2],   y = M[3] * j + M[4] * i + M[5].
 * The four taps around (floor(x), floor(y)) are blended with the weights
 * (1-u)(1-v), u(1-v), (1-u)v and uv, where u and v are the fractional parts.
 *
 * @param inHeight       source height in pixels
 * @param inWidth        source width in pixels
 * @param inWidthStride  source row stride, in elements of T
 * @param outHeight      destination height in pixels
 * @param outWidth       destination width in pixels
 * @param outWidthStride destination row stride, in elements of T
 * @param dst            destination image
 * @param src            source image
 * @param M              six coefficients M[0..5] of the destination-to-source mapping
 * @param delta          value used for taps outside the source (BORDER_CONSTANT only)
 * @param nc             number of interleaved channels
 * @param borderMode     BORDER_CONSTANT substitutes delta for out-of-range taps,
 *                       BORDER_REPLICATE clamps tap coordinates to the image,
 *                       BORDER_TRANSPARENT writes a pixel only when all four taps are
 *                       inside the source; for any other value nothing is written
 */
template <typename T>
void warpaffine_linear(
    int32_t inHeight,
    int32_t inWidth,
    int32_t inWidthStride,
    int32_t outHeight,
    int32_t outWidth,
    int32_t outWidthStride,
    T *dst,
    const T *src,
    const float *M,
    T delta,
    int32_t nc,
    ppl::cv::BorderType borderMode)
{
    for (int32_t i = 0; i < outHeight; i++) {
        const float base_x = M[1] * i + M[2];
        const float base_y = M[4] * i + M[5];
        for (int32_t j = 0; j < outWidth; j++) {
            const float x = base_x + M[0] * j;
            const float y = base_y + M[3] * j;

            // Use floor rather than a truncating cast: truncation maps coordinates in
            // (-1, 0) to tap 0 with a negative fractional weight, which extrapolates
            // instead of blending with the border. Non-negative coordinates are unaffected.
            int32_t sx0 = static_cast<int32_t>(std::floor(x));
            int32_t sy0 = static_cast<int32_t>(std::floor(y));
            const float u = x - sx0;
            const float v = y - sy0;

            // Separable weights, then the four per-tap products (row-major: top-left,
            // top-right, bottom-left, bottom-right).
            const float tabx[2] = {1.0f - u, u};
            const float taby[2] = {1.0f - v, v};
            const float tab[4] = {taby[0] * tabx[0],
                                  taby[0] * tabx[1],
                                  taby[1] * tabx[0],
                                  taby[1] * tabx[1]};

            const int32_t idxDst = (i * outWidthStride + j * nc);

            // Per-axis range tests for the two tap columns and the two tap rows.
            const bool x0_in = (sx0 >= 0 && sx0 < inWidth);
            const bool x1_in = (sx0 + 1 >= 0 && sx0 + 1 < inWidth);
            const bool y0_in = (sy0 >= 0 && sy0 < inHeight);
            const bool y1_in = (sy0 + 1 >= 0 && sy0 + 1 < inHeight);

            if (borderMode == ppl::cv::BORDER_CONSTANT) {
                // Offsets of the top-left and bottom-left taps; only dereferenced when the
                // corresponding range test passed.
                const int32_t position1 = (sy0 * inWidthStride + sx0 * nc);
                const int32_t position2 = ((sy0 + 1) * inWidthStride + sx0 * nc);
                for (int32_t k = 0; k < nc; k++) {
                    const float v0 = (x0_in && y0_in) ? src[position1 + k] : delta;
                    const float v1 = (x1_in && y0_in) ? src[position1 + nc + k] : delta;
                    const float v2 = (x0_in && y1_in) ? src[position2 + k] : delta;
                    const float v3 = (x1_in && y1_in) ? src[position2 + nc + k] : delta;
                    float sum = 0;
                    sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3];
                    dst[idxDst + k] = static_cast<T>(sum);
                }
            } else if (borderMode == ppl::cv::BORDER_REPLICATE) {
                int32_t sx1 = sx0 + 1;
                int32_t sy1 = sy0 + 1;
                sx0 = clip(sx0, 0, inWidth - 1);
                sx1 = clip(sx1, 0, inWidth - 1);
                sy0 = clip(sy0, 0, inHeight - 1);
                sy1 = clip(sy1, 0, inHeight - 1);
                const T *t0 = src + sy0 * inWidthStride + sx0 * nc;
                const T *t1 = src + sy0 * inWidthStride + sx1 * nc;
                const T *t2 = src + sy1 * inWidthStride + sx0 * nc;
                const T *t3 = src + sy1 * inWidthStride + sx1 * nc;
                for (int32_t k = 0; k < nc; ++k) {
                    float sum = 0;
                    sum += t0[k] * tab[0] + t1[k] * tab[1] + t2[k] * tab[2] + t3[k] * tab[3];
                    dst[idxDst + k] = static_cast<T>(sum);
                }
            } else if (borderMode == ppl::cv::BORDER_TRANSPARENT) {
                // Leave the destination pixel untouched unless every tap is inside the source.
                if (x0_in && x1_in && y0_in && y1_in) {
                    const int32_t position1 = (sy0 * inWidthStride + sx0 * nc);
                    const int32_t position2 = ((sy0 + 1) * inWidthStride + sx0 * nc);
                    for (int32_t k = 0; k < nc; k++) {
                        const float v0 = src[position1 + k];
                        const float v1 = src[position1 + nc + k];
                        const float v2 = src[position2 + k];
                        const float v3 = src[position2 + nc + k];
                        float sum = 0;
                        sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3];
                        dst[idxDst + k] = static_cast<T>(sum);
                    }
                }
            }
        }
    }
}

void warpAffine_linear_uint8_t(
    uint8_t *dst,
    const uint8_t *src,
    int32_t inHeight,
    int32_t inWidth,
    int32_t inWidthStride,
    int32_t outHeight,
    int32_t outWidth,
    int32_t outWidthStride,
    const float *M,
    int32_t cn,
    int32_t borderMode,
    uint8_t borderValue = 0)
{
    int32_t *buffer = nullptr;
    int32_t ret = posix_memalign((void **)(&buffer), 32, (outHeight + outWidth) * 2 * sizeof(int32_t));
    if (ret) {
        return;
    }
    int32_t *adelta = buffer;
    int32_t *bdelta = buffer + outWidth * 2;

    int32_t *ptra = adelta;
    int32_t *ptrb = bdelta;

    for (int32_t x = 0; x < outWidth; ++x) {
        *ptra++ = static_cast<int32_t>(M[0] * x * 1024);
        *ptra++ = static_cast<int32_t>(M[3] * x * 1024);
    }

    for (int32_t y = 0; y < outHeight; ++y) {
        *ptrb++ = static_cast<int32_t>((M[1] * y + M[2]) * 1024);
        *ptrb++ = static_cast<int32_t>((M[4] * y + M[5]) * 1024);
    }

    int32_t DELTA = 1 << 14;

    if (cn == 1) {
        uint32_t *buf_loc = new uint32_t[outWidth];
        int16_t *tab_loc = new int16_t[outWidth];

        uint16_t *buf_point = (uint16_t *)buf_loc;
        const uint8_t *src2 = src + inWidthStride;

        for (int32_t y = 0; y < outHeight; ++y) {
            int32_t x_count = 0;
            int32_t end_x = 0;
            int32_t final_loc_base = y * outWidthStride;
            for (int32_t x = 0; x < outWidth; ++x) {
                int32_t final_loc = final_loc_base + x;
                int32_t new_x = adelta[2 * x] + bdelta[2 * y] + 16;
                int32_t new_y = adelta[2 * x + 1] + bdelta[2 * y + 1] + 16;
                int32_t new_x_full = new_x >> 5;
                int32_t new_y_full = new_y >> 5;
                int32_t new_x_loc = new_x >> 10;
                int32_t new_y_loc = new_y >> 10;

                int16_t new_xy_float = (new_x_full & 31) + (new_y_full & 31) * 32;
                int16_t *wtab = BilinearTab_i[new_xy_float][0];
                int32_t loc_base = new_y_loc * inWidthStride + new_x_loc;

                if (((new_x_loc >= 0) && (new_x_loc < inWidth - 1) && (new_y_loc >= 0) && (new_y_loc < inHeight - 1))) {
                    uint16_t *ptr = (uint16_t *)(src + loc_base);
                    uint16_t *ptr2 = (uint16_t *)(src2 + loc_base);
                    buf_point[2 * x] = ptr[0];
                    buf_point[2 * x + 1] = ptr2[0];
                    tab_loc[x] = new_xy_float;
                    x_count++;
                    end_x = x;
                } else {
                    if (borderMode == BORDER_CONSTANT) {
                        int32_t mask0 = new_x_loc >= 0 &&
                                        new_x_loc <= (inWidth - 1) &&
                                        new_y_loc >= 0 &&
                                        new_y_loc <= (inHeight - 1);

                        int32_t mask1 = new_x_loc >= -1 &&
                                        new_x_loc <= (inWidth - 2) &&
                                        new_y_loc >= 0 &&
                                        new_y_loc <= (inHeight - 1);

                        int32_t mask2 = new_x_loc >= 0 &&
                                        new_x_loc <= (inWidth - 1) &&
                                        new_y_loc >= -1 &&
                                        new_y_loc <= (inHeight - 2);

                        int32_t mask3 = new_x_loc >= -1 &&
                                        new_x_loc <= (inWidth - 2) &&
                                        new_y_loc >= -1 &&
                                        new_y_loc <= (inHeight - 2);

                        int32_t val_xy0 = 0;
                        if (mask0) {
                            val_xy0 += wtab[0] * src[loc_base];
                        } else {
                            val_xy0 += wtab[0] * borderValue;
                        }
                        if (mask1) {
                            val_xy0 += wtab[1] * src[loc_base + 1];
                        } else {
                            val_xy0 += wtab[1] * borderValue;
                        }
                        if (mask2) {
                            val_xy0 += wtab[2] * src2[loc_base];
                        } else {
                            val_xy0 += wtab[2] * borderValue;
                        }
                        if (mask3) {
                            val_xy0 += wtab[3] * src2[loc_base + 1];
                        } else {
                            val_xy0 += wtab[3] * borderValue;
                        }

                        dst[final_loc] = static_cast<uint8_t>((val_xy0 + DELTA) >> 15);
                    } else if (borderMode == BORDER_TRANSPARENT) {
                        continue;
                    } else if (borderMode == BORDER_REPLICATE) {
                        int32_t sx0 = clip(new_x_loc, 0, inWidth - 1);
                        int32_t sy0 = clip(new_y_loc, 0, inHeight - 1);
                        int32_t sx1 = clip((new_x_loc + 1), 0, inWidth - 1);
                        int32_t sy1 = clip((new_y_loc + 1), 0, inHeight - 1);

                        int32_t val_xy0 =
                            src[sy0 * inWidthStride + sx0] * wtab[0] +
                            src[sy0 * inWidthStride + sx1] * wtab[1] +
                            src[sy1 * inWidthStride + sx0] * wtab[2] +
                            src[sy1 * inWidthStride + sx1] * wtab[3];

                        dst[final_loc] = static_cast<uint8_t>((val_xy0 + DELTA) >> 15);
                    }
                }
            }
            int32_t x = end_x - x_count + 1;
            uint8_t *ptr = (uint8_t *)(buf_loc + x);

            int32x4_t DELTA_vec = vdupq_n_s32(DELTA);
            uint8_t *dst_loc = dst + final_loc_base + x;

            int16_t *BilinearTab_ptr = BilinearTab_i[0][0];
            int32_t simd_loop = x_count >> 3;

            if (simd_loop > 0) {
                asm volatile(
                    "subs x12, %7, #1\n\t"
                    "blt 1f\n\t"
                    "#load from tab_loc\n\t"
                    "add x13, %4, %5, lsl #1\n\t"
                    "ldrsh x14, [x13]\n\t"
                    "ldrsh x9, [x13, #2]\n\t"
                    "ldrsh x10, [x13, #4]\n\t"
                    "ldrsh x11, [x13, #6]\n\t"
                    "#load from ptr\n\t"
                    "ld1 {v0.4s}, [%0], #16\n\t"
                    "add x14, %3, x14, lsl #3\n\t"
                    "add x9, %3, x9, lsl #3\n\t"
                    "add x10, %3, x10, lsl #3\n\t"
                    "add x11, %3, x11, lsl #3\n\t"
                    "0:\n\t"
                    "ld1 {v1.4s}, [%0], #16\n\t"
                    "ins v2.s[0], v0.s[1]\n\t"
                    "ins v3.s[0], v0.s[2]\n\t"
                    "ins v4.s[0], v0.s[3]\n\t"
                    "#load from BilinearTab\n\t"
                    "ld1 {v8.4h}, [x14]\n\t"
                    "ld1 {v9.4h}, [x9]\n\t"
                    "ld1 {v10.4h}, [x10]\n\t"
                    "ld1 {v11.4h}, [x11]\n\t"
                    "ldrsh x14, [x13, #8]\n\t"
                    "ldrsh x9, [x13, #10]\n\t"
                    "ldrsh x10, [x13, #12]\n\t"
                    "ldrsh x11, [x13, #14]\n\t"
                    "add x13, x13, #16\n\t"
                    "#start calculation\n\t"
                    "ushll v0.8h, v0.8b, #0\n\t"
                    "ushll v2.8h, v2.8b, #0\n\t"
                    "ushll v3.8h, v3.8b, #0\n\t"
                    "ushll v4.8h, v4.8b, #0\n\t"
                    "ins v5.s[0], v1.s[1]\n\t"
                    "ins v6.s[0], v1.s[2]\n\t"
                    "ins v7.s[0], v1.s[3]\n\t"
                    "add x14, %3, x14, lsl #3\n\t"
                    "add x9, %3, x9, lsl #3\n\t"
                    "add x10, %3, x10, lsl #3\n\t"
                    "add x11, %3, x11, lsl #3\n\t"
                    "smull v0.4s, v0.4h, v8.4h\n\t"
                    "smull v2.4s, v2.4h, v9.4h\n\t"
                    "smull v3.4s, v3.4h, v10.4h\n\t"
                    "smull v4.4s, v4.4h, v11.4h\n\t"
                    "ld1 {v8.4h}, [x14]\n\t"
                    "ld1 {v9.4h}, [x9]\n\t"
                    "ld1 {v10.4h}, [x10]\n\t"
                    "ld1 {v11.4h}, [x11]\n\t"
                    "ushll v1.8h, v1.8b, #0\n\t"
                    "ushll v5.8h, v5.8b, #0\n\t"
                    "ushll v6.8h, v6.8b, #0\n\t"
                    "ushll v7.8h, v7.8b, #0\n\t"
                    "addp v0.4s, v0.4s, v2.4s\n\t"
                    "addp v3.4s, v3.4s, v4.4s\n\t"
                    "smull v1.4s, v1.4h, v8.4h\n\t"
                    "smull v5.4s, v5.4h, v9.4h\n\t"
                    "smull v6.4s, v6.4h, v10.4h\n\t"
                    "smull v7.4s, v7.4h, v11.4h\n\t"
                    "addp v0.4s, v0.4s, v3.4s\n\t"
                    "ldrsh x14, [x13]\n\t"
                    "ldrsh x9, [x13, #2]\n\t"
                    "ldrsh x10, [x13, #4]\n\t"
                    "ldrsh x11, [x13, #6]\n\t"
                    "addp v1.4s, v1.4s, v5.4s\n\t"
                    "addp v6.4s, v6.4s, v7.4s\n\t"
                    "add v2.4s, v0.4s, %6.4s\n\t"
                    "ld1 {v0.4s}, [%0], #16\n\t"
                    "add x14, %3, x14, lsl #3\n\t"
                    "add x9, %3, x9, lsl #3\n\t"
                    "add x10, %3, x10, lsl #3\n\t"
                    "add x11, %3, x11, lsl #3\n\t"
                    "addp v1.4s, v1.4s, v6.4s\n\t"
                    "shrn v2.4h, v2.4s, #15\n\t"
                    "add v1.4s, v1.4s, %6.4s\n\t"
                    "subs x12, x12, #1\n\t"
                    "shrn v1.4h, v1.4s, #15\n\t"
                    "ins v2.d[1], v1.d[0]\n\t"
                    "sqxtun v2.8b, v2.8h\n\t"
                    "st1 {v2.8b}, [%2], #8\n\t"
                    "bge 0b\n\t"
                    "sub %0, %0, #16\n\t"
                    "1:\n\t"
                    : "=r"(ptr)
                    : "0"(ptr), "r"(dst_loc), "r"(BilinearTab_ptr), "r"(tab_loc), "r"(x), "w"(DELTA_vec), "r"(simd_loop)
                    : "cc", "memory", "x13", "x14", "x9", "x10", "x11", "x12", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11");
                x = x + (simd_loop << 3);
            }

            for (; x <= end_x; ++x) {
                int32_t final_loc = final_loc_base + x;
                int16_t *wtab = BilinearTab_i[tab_loc[x]][0];
                int32_t point0 = ptr[0];
                int32_t point1 = ptr[1];
                int32_t point2 = ptr[2];
                int32_t point3 = ptr[3];
                ptr += 4;

                int32_t val_xy0 = wtab[0] * point0 + wtab[1] * point1 + wtab[2] * point2 + wtab[3] * point3;
                dst[final_loc] = static_cast<uint8_t>((val_xy0 + DELTA) >> 15);
            }
        }

        delete[] buf_loc;
        delete[] tab_loc;
    } // cn == 1
    else if (cn == 3) {
        int32_t *buf_loc = new int32_t[outWidth + 4];
        int16_t *short_buf = new int16_t[outWidth * 2 + outWidth + outWidth + 4];
        int16_t *xy_loc_buf = short_buf;
        int16_t *xy_float_buf = xy_loc_buf + 2 * outWidth;
        int16_t *tab_loc = xy_float_buf + outWidth;
        const uint8_t *src2 = src + inWidthStride;

        for (int32_t y = 0; y < outHeight; ++y) {
            int32_t x_count = 0;
            int32_t end_x = 0;
            int32_t final_loc_base = y * outWidthStride;

            int32x4_t off_vec = vdupq_n_s32(16);
            int16x4_t mask31 = vdup_n_s16(31);
            int16x8_t mask_mull = {1, 32, 1, 32, 1, 32, 1, 32};
            int32x4_t bdelta_vec = {bdelta[2 * y], bdelta[2 * y + 1], bdelta[2 * y], bdelta[2 * y + 1]};
            int32_t idx = 0;
            for (; idx <= outWidth - 4; idx += 4) {
                int32x4_t adelta0 = vaddq_s32(vld1q_s32(adelta + 2 * idx), off_vec);
                int32x4_t adelta1 = vaddq_s32(vld1q_s32(adelta + 2 * idx + 4), off_vec);
                //x0y0,x1y1
                int32x4_t x0y0 = vaddq_s32(adelta0, bdelta_vec);
                //x2y2,x3y3
                int32x4_t x2y2 = vaddq_s32(adelta1, bdelta_vec);
                int16x4_t x0y0sh = vshrn_n_s32(x0y0, 5);
                int16x4_t x2y2sh = vshrn_n_s32(x2y2, 5);
                int16x8_t xy_float = vcombine_s16(vand_s16(x0y0sh, mask31), vand_s16(x2y2sh, mask31));
                xy_float = vmulq_s16(xy_float, mask_mull);
                int16x8_t xy = vcombine_s16(vshrn_n_s32(x0y0, 10), vshrn_n_s32(x2y2, 10));
                int16x4_t xy_float0 = vpadd_s16(vget_low_s16(xy_float), vget_high_s16(xy_float));
                vst1q_s16(xy_loc_buf + idx * 2, xy);
                vst1_s16(xy_float_buf + idx, xy_float0);
            }
            for (; idx < outWidth; idx++) {
                int32_t new_x = adelta[2 * idx] + bdelta[2 * y] + 16;
                int32_t new_y = adelta[2 * idx + 1] + bdelta[2 * y + 1] + 16;
                int32_t new_x_full = new_x >> 5;
                int32_t new_y_full = new_y >> 5;
                xy_loc_buf[idx * 2] = (new_x >> 10);
                xy_loc_buf[idx * 2 + 1] = (new_y >> 10);
                xy_float_buf[idx] = (new_x_full & 31) + (new_y_full & 31) * 32;
            }
            for (int32_t x = 0; x < outWidth; ++x) {
                int32_t new_x_loc = xy_loc_buf[x * 2];
                int32_t new_y_loc = xy_loc_buf[x * 2 + 1];
                int32_t new_xy_float = xy_float_buf[x];
                int16_t *wtab = BilinearTab_i[new_xy_float][0];

                if ((new_x_loc >= 0) && (new_x_loc < inWidth - 1) && (new_y_loc >= 0) && (new_y_loc < inHeight - 1)) {
                    buf_loc[x] = new_y_loc * inWidthStride + new_x_loc * 3;
                    tab_loc[x] = new_xy_float;
                    x_count++;
                    end_x = x;
                } else {
                    if (borderMode == BORDER_CONSTANT) {
                        int32_t loc_buffer = new_y_loc * inWidthStride + new_x_loc * 3;
                        int32_t final_loc = final_loc_base + x * 3;

                        int32_t mask0 = new_x_loc >= 0 &&
                                        new_x_loc <= (inWidth - 1) &&
                                        new_y_loc >= 0 &&
                                        new_y_loc <= (inHeight - 1);

                        int32_t mask1 = new_x_loc >= -1 &&
                                        new_x_loc <= (inWidth - 2) &&
                                        new_y_loc >= 0 &&
                                        new_y_loc <= (inHeight - 1);

                        int32_t mask2 = new_x_loc >= 0 &&
                                        new_x_loc <= (inWidth - 1) &&
                                        new_y_loc >= -1 &&
                                        new_y_loc <= (inHeight - 2);

                        int32_t mask3 = new_x_loc >= -1 &&
                                        new_x_loc <= (inWidth - 2) &&
                                        new_y_loc >= -1 &&
                                        new_y_loc <= (inHeight - 2);

                        int32_t val_xy0 = 0;
                        int32_t val_xy1 = 0;
                        int32_t val_xy2 = 0;
                        if (mask0) {
                            val_xy0 += wtab[0] * src[loc_buffer + 0];
                            val_xy1 += wtab[0] * src[loc_buffer + 1];
                            val_xy2 += wtab[0] * src[loc_buffer + 2];
                        } else {
                            val_xy0 += wtab[0] * borderValue;
                            val_xy1 += wtab[0] * borderValue;
                            val_xy2 += wtab[0] * borderValue;
                        }
                        if (mask1) {
                            val_xy0 += wtab[1] * src[loc_buffer + 3];
                            val_xy1 += wtab[1] * src[loc_buffer + 4];
                            val_xy2 += wtab[1] * src[loc_buffer + 5];
                        } else {
                            val_xy0 += wtab[1] * borderValue;
                            val_xy1 += wtab[1] * borderValue;
                            val_xy2 += wtab[1] * borderValue;
                        }
                        if (mask2) {
                            val_xy0 += wtab[2] * src2[loc_buffer + 0];
                            val_xy1 += wtab[2] * src2[loc_buffer + 1];
                            val_xy2 += wtab[2] * src2[loc_buffer + 2];
                        } else {
                            val_xy0 += wtab[2] * borderValue;
                            val_xy1 += wtab[2] * borderValue;
                            val_xy2 += wtab[2] * borderValue;
                        }
                        if (mask3) {
                            val_xy0 += wtab[3] * src2[loc_buffer + 3];
                            val_xy1 += wtab[3] * src2[loc_buffer + 4];
                            val_xy2 += wtab[3] * src2[loc_buffer + 5];
                        } else {
                            val_xy0 += wtab[3] * borderValue;
                            val_xy1 += wtab[3] * borderValue;
                            val_xy2 += wtab[3] * borderValue;
                        }

                        dst[final_loc + 0] = static_cast<uint8_t>((val_xy0 + DELTA) >> 15);
                        dst[final_loc + 1] = static_cast<uint8_t>((val_xy1 + DELTA) >> 15);
                        dst[final_loc + 2] = static_cast<uint8_t>((val_xy2 + DELTA) >> 15);
                    } else if (borderMode == BORDER_TRANSPARENT) {
                        continue;
                    } else if (borderMode == BORDER_REPLICATE) {
                        int32_t sx0 = clip(new_x_loc, 0, inWidth - 1);
                        int32_t sy0 = clip(new_y_loc, 0, inHeight - 1);
                        int32_t sx1 = clip((new_x_loc + 1), 0, inWidth - 1);
                        int32_t sy1 = clip((new_y_loc + 1), 0, inHeight - 1);

                        int32_t val_xy0 =
                            src[sy0 * inWidthStride + sx0 * 3 + 0] * wtab[0] +
                            src[sy0 * inWidthStride + sx1 * 3 + 0] * wtab[1] +
                            src[sy1 * inWidthStride + sx0 * 3 + 0] * wtab[2] +
                            src[sy1 * inWidthStride + sx1 * 3 + 0] * wtab[3];

                        int32_t val_xy1 =
                            src[sy0 * inWidthStride + sx0 * 3 + 1] * wtab[0] +
                            src[sy0 * inWidthStride + sx1 * 3 + 1] * wtab[1] +
                            src[sy1 * inWidthStride + sx0 * 3 + 1] * wtab[2] +
                            src[sy1 * inWidthStride + sx1 * 3 + 1] * wtab[3];

                        int32_t val_xy2 =
                            src[sy0 * inWidthStride + sx0 * 3 + 2] * wtab[0] +
                            src[sy0 * inWidthStride + sx1 * 3 + 2] * wtab[1] +
                            src[sy1 * inWidthStride + sx0 * 3 + 2] * wtab[2] +
                            src[sy1 * inWidthStride + sx1 * 3 + 2] * wtab[3];

                        int32_t final_loc = final_loc_base + x * 3;
                        dst[final_loc + 0] = static_cast<uint8_t>((val_xy0 + DELTA) >> 15);
                        dst[final_loc + 1] = static_cast<uint8_t>((val_xy1 + DELTA) >> 15);
                        dst[final_loc + 2] = static_cast<uint8_t>((val_xy2 + DELTA) >> 15);
                    }
                }
            }

            int32_t x = end_x - x_count + 1;
            int32x4_t DELTA_vec = vdupq_n_s32(DELTA);
            uint8x8_t tb = {0, 1, 2, 4, 5, 6, 0, 0};
            uint8_t *dst_loc = dst + final_loc_base + x * 3;

            int16_t *BilinearTab_ptr = BilinearTab_i[0][0];
            int32_t simd_loop = x_count >> 2;
            int32_t cmp_flag = end_x - 4 + 1;
            if (simd_loop > 0) {
                asm volatile(
                    "subs x25, %2, #1\n\t"
                    "blt 1f\n\t"
                    "add x26, %4, %3, lsl #1\n\t"
                    "add x27, %5, %3, lsl #2\n\t"
                    "ldrsh x19, [x26]\n\t"
                    "ldrsh x20, [x26, #2]\n\t"
                    "add x26, x26, #4\n\t"
                    "ldpsw x21, x22, [x27], #8\n\t"
                    "add x19, %6, x19, lsl #3\n\t"
                    "add x20, %6, x20, lsl #3\n\t"
                    "0:\n\t"
                    "ldrsh x23, [x26]\n\t"
                    "ldrsh x24, [x26, #2]\n\t"
                    "add x26, x26, #4\n\t"
                    //vec00 vec01 vec10 vec11
                    "ldr d2, [%7, x21]\n\t"
                    "ldr d4, [%7, x22]\n\t"
                    "ldr d3, [%8, x21]\n\t"
                    "ldr d5, [%8, x22]\n\t"
                    //wtab0 and wtab1
                    "ld1 {v0.4h}, [x19]\n\t"
                    "ld1 {v1.4h}, [x20]\n\t"
                    "ldpsw x21, x22, [x27], #8\n\t"
                    "add x23, %6, x23, lsl #3\n\t"
                    "add x24, %6, x24, lsl #3\n\t"
                    //calculation of vec00,01,10,11
                    "ushll v2.8h, v2.8b, #0\n\t"
                    "ushll v3.8h, v3.8b, #0\n\t"
                    "ushll v4.8h, v4.8b, #0\n\t"
                    "ushll v5.8h, v5.8b, #0\n\t"
                    "mov v6.d[0], v2.d[1]\n\t"
                    "mov v7.d[0], v3.d[1]\n\t"
                    "mov v8.d[0], v4.d[1]\n\t"
                    "mov v9.d[0], v5.d[1]\n\t"
                    //vec20 vec21 vec30 vec31
                    "ldr d16, [%7, x21]\n\t"
                    "ldr d18, [%7, x22]\n\t"
                    "ldr d17, [%8, x21]\n\t"
                    "ldr d19, [%8, x22]\n\t"
                    "ext v6.8b, v2.8b, v6.8b, #6\n\t"
                    "ext v7.8b, v3.8b, v7.8b, #6\n\t"
                    "ext v8.8b, v4.8b, v8.8b, #6\n\t"
                    "ext v9.8b, v5.8b, v9.8b, #6\n\t"
                    //wtab2 and wtab3
                    "ld1 {v14.4h}, [x23]\n\t"
                    "ld1 {v15.4h}, [x24]\n\t"
                    "smull v10.4s, v2.4h, v0.h[0]\n\t"
                    "smull v11.4s, v3.4h, v0.h[2]\n\t"
                    "smull v12.4s, v4.4h, v1.h[0]\n\t"
                    "smull v13.4s, v5.4h, v1.h[2]\n\t"
                    "smlal v10.4s, v6.4h, v0.h[1]\n\t"
                    "smlal v11.4s, v7.4h, v0.h[3]\n\t"
                    "smlal v12.4s, v8.4h, v1.h[1]\n\t"
                    "smlal v13.4s, v9.4h, v1.h[3]\n\t"
                    //next loop
                    "ldrsh x19, [x26]\n\t"
                    "ldrsh x20, [x26, #2]\n\t"
                    "add x26, x26, #4\n\t"
                    "ldpsw x21, x22, [x27], #8\n\t"
                    //calculation of vec00,01,10,11
                    "ushll v16.8h, v16.8b, #0\n\t"
                    "ushll v17.8h, v17.8b, #0\n\t"
                    "ushll v18.8h, v18.8b, #0\n\t"
                    "ushll v19.8h, v19.8b, #0\n\t"
                    "add x19, %6, x19, lsl #3\n\t"
                    "add x20, %6, x20, lsl #3\n\t"
                    "mov v6.d[0], v16.d[1]\n\t"
                    "mov v7.d[0], v17.d[1]\n\t"
                    "mov v8.d[0], v18.d[1]\n\t"
                    "mov v9.d[0], v19.d[1]\n\t"
                    "ext v6.8b, v16.8b, v6.8b, #6\n\t"
                    "ext v7.8b, v17.8b, v7.8b, #6\n\t"
                    "ext v8.8b, v18.8b, v8.8b, #6\n\t"
                    "ext v9.8b, v19.8b, v9.8b, #6\n\t"
                    "smull v20.4s, v16.4h, v14.h[0]\n\t"
                    "smull v21.4s, v17.4h, v14.h[2]\n\t"
                    "smull v22.4s, v18.4h, v15.h[0]\n\t"
                    "smull v23.4s, v19.4h, v15.h[2]\n\t"
                    "smlal v20.4s, v6.4h, v14.h[1]\n\t"
                    "smlal v21.4s, v7.4h, v14.h[3]\n\t"
                    "smlal v22.4s, v8.4h, v15.h[1]\n\t"
                    "smlal v23.4s, v9.4h, v15.h[3]\n\t"
                    //results calculation
                    "add v10.4s, v10.4s, v11.4s\n\t"
                    "add v12.4s, v12.4s, v13.4s\n\t"
                    "add v20.4s, v20.4s, v21.4s\n\t"
                    "add v22.4s, v22.4s, v23.4s\n\t"
                    "add v10.4s, v10.4s, %9.4s\n\t"
                    "add v12.4s, v12.4s, %9.4s\n\t"
                    "add v20.4s, v20.4s, %9.4s\n\t"
                    "add v22.4s, v22.4s, %9.4s\n\t"
                    "shrn v10.4h, v10.4s, #15\n\t"
                    "shrn v11.4h, v12.4s, #15\n\t"
                    "shrn v12.4h, v20.4s, #15\n\t"
                    "shrn v13.4h, v22.4s, #15\n\t"
                    "ins v10.d[1], v11.d[0]\n\t"
                    "ins v12.d[1], v13.d[0]\n\t"
                    "sqxtun v10.8b, v10.8h\n\t"
                    "sqxtun v12.8b, v12.8h\n\t"
                    "subs x25, x25, #1\n\t"
                    "tbl v10.8b, {v10.16b}, %10.8b\n\t"
                    "tbl v12.8b, {v12.16b}, %10.8b\n\t"
                    "st1 {v10.s}[0], [%0], #4\n\t"
                    "st1 {v10.h}[2], [%0], #2\n\t"
                    "st1 {v12.s}[0], [%0], #4\n\t"
                    "st1 {v12.h}[2], [%0], #2\n\t"
                    "bge 0b\n\t"
                    "1:\n\t"
                    : "=r"(dst_loc)
                    : "0"(dst_loc), "r"(simd_loop), "r"(x), "r"(tab_loc), "r"(buf_loc), "r"(BilinearTab_ptr), "r"(src), "r"(src2), "w"(DELTA_vec), "w"(tb), "r"(cmp_flag)
                    : "cc", "memory", "x26", "x27", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
                x = x + (simd_loop << 2);
            }

            for (; x <= end_x; x++) {
                int32_t final_loc = final_loc_base + x * 3;
                int32_t loc_buffer = buf_loc[x];
                int16_t *wtab = BilinearTab_i[tab_loc[x]][0];
                int32_t point00 = src[loc_buffer];
                int32_t point01 = src[loc_buffer + 1];
                int32_t point02 = src[loc_buffer + 2];
                int32_t point03 = src[loc_buffer + 3];
                int32_t point04 = src[loc_buffer + 4];
                int32_t point05 = src[loc_buffer + 5];
                int32_t point10 = src2[loc_buffer];
                int32_t point11 = src2[loc_buffer + 1];
                int32_t point12 = src2[loc_buffer + 2];
                int32_t point13 = src2[loc_buffer + 3];
                int32_t point14 = src2[loc_buffer + 4];
                int32_t point15 = src2[loc_buffer + 5];

                int32_t val_xy0 = wtab[0] * point00 + wtab[1] * point03 + wtab[2] * point10 + wtab[3] * point13;
                int32_t val_xy1 = wtab[0] * point01 + wtab[1] * point04 + wtab[2] * point11 + wtab[3] * point14;
                int32_t val_xy2 = wtab[0] * point02 + wtab[1] * point05 + wtab[2] * point12 + wtab[3] * point15;
                dst[final_loc + 0] = static_cast<uint8_t>((val_xy0 + DELTA) >> 15);
                dst[final_loc + 1] = static_cast<uint8_t>((val_xy1 + DELTA) >> 15);
                dst[final_loc + 2] = static_cast<uint8_t>((val_xy2 + DELTA) >> 15);
            }
        }

        delete[] buf_loc;
        delete[] short_buf;
    } // cn == 3
    else if (cn == 4) {
        int32_t *buf_loc = new int32_t[outWidth + 4];
        int16_t *short_buf = new int16_t[outWidth * 2 + outWidth + outWidth + 4];
        int16_t *xy_loc_buf = short_buf;
        int16_t *xy_float_buf = xy_loc_buf + 2 * outWidth;
        int16_t *tab_loc = xy_float_buf + outWidth;
        const uint8_t *src2 = src + inWidthStride;

        int32x4_t off_vec = vdupq_n_s32(16);
        int16x4_t mask31 = vdup_n_s16(31);
        int16x8_t mask_mull = {1, 32, 1, 32, 1, 32, 1, 32};
        for (int32_t y = 0; y < outHeight; ++y) {
            int32_t x_count = 0;
            int32_t end_x = 0;
            int32_t final_loc_base = y * outWidthStride;

            int32x4_t bdelta_vec = {bdelta[2 * y], bdelta[2 * y + 1], bdelta[2 * y], bdelta[2 * y + 1]};
            int32_t idx = 0;
            for (; idx <= outWidth - 4; idx += 4) {
                int32x4_t adelta0 = vaddq_s32(vld1q_s32(adelta + 2 * idx), off_vec);
                int32x4_t adelta1 = vaddq_s32(vld1q_s32(adelta + 2 * idx + 4), off_vec);
                //x0y0,x1y1
                int32x4_t x0y0 = vaddq_s32(adelta0, bdelta_vec);
                //x2y2,x3y3
                int32x4_t x2y2 = vaddq_s32(adelta1, bdelta_vec);
                int16x4_t x0y0sh = vshrn_n_s32(x0y0, 5);
                int16x4_t x2y2sh = vshrn_n_s32(x2y2, 5);
                int16x8_t xy_float = vcombine_s16(vand_s16(x0y0sh, mask31), vand_s16(x2y2sh, mask31));
                xy_float = vmulq_s16(xy_float, mask_mull);
                int16x8_t xy = vcombine_s16(vshrn_n_s32(x0y0, 10), vshrn_n_s32(x2y2, 10));
                int16x4_t xy_float0 = vpadd_s16(vget_low_s16(xy_float), vget_high_s16(xy_float));
                vst1q_s16(xy_loc_buf + idx * 2, xy);
                vst1_s16(xy_float_buf + idx, xy_float0);
            }
            for (; idx < outWidth; idx++) {
                int32_t new_x = adelta[2 * idx] + bdelta[2 * y] + 16;
                int32_t new_y = adelta[2 * idx + 1] + bdelta[2 * y + 1] + 16;
                int32_t new_x_full = new_x >> 5;
                int32_t new_y_full = new_y >> 5;
                xy_loc_buf[idx * 2] = (new_x >> 10);
                xy_loc_buf[idx * 2 + 1] = (new_y >> 10);
                xy_float_buf[idx] = (new_x_full & 31) + (new_y_full & 31) * 32;
            }
            for (int32_t x = 0; x < outWidth; ++x) {
                int32_t new_x_loc = xy_loc_buf[x * 2];
                int32_t new_y_loc = xy_loc_buf[x * 2 + 1];
                int32_t new_xy_float = xy_float_buf[x];
                int16_t *wtab = BilinearTab_i[new_xy_float][0];

                if ((new_x_loc >= 0) && (new_x_loc < inWidth - 1) && (new_y_loc >= 0) && (new_y_loc < inHeight - 1)) {
                    buf_loc[x] = new_y_loc * inWidthStride + new_x_loc * 4;
                    tab_loc[x] = new_xy_float;
                    x_count++;
                    end_x = x;
                } else {
                    if (borderMode == BORDER_CONSTANT) {
                        int32_t loc_buffer = new_y_loc * inWidthStride + new_x_loc * 4;
                        int32_t final_loc = final_loc_base + x * 4;

                        int32_t mask0 = new_x_loc >= 0 &&
                                        new_x_loc <= (inWidth - 1) &&
                                        new_y_loc >= 0 &&
                                        new_y_loc <= (inHeight - 1);

                        int32_t mask1 = new_x_loc >= -1 &&
                                        new_x_loc <= (inWidth - 2) &&
                                        new_y_loc >= 0 &&
                                        new_y_loc <= (inHeight - 1);

                        int32_t mask2 = new_x_loc >= 0 &&
                                        new_x_loc <= (inWidth - 1) &&
                                        new_y_loc >= -1 &&
                                        new_y_loc <= (inHeight - 2);

                        int32_t mask3 = new_x_loc >= -1 &&
                                        new_x_loc <= (inWidth - 2) &&
                                        new_y_loc >= -1 &&
                                        new_y_loc <= (inHeight - 2);

                        int32_t val_xy0 = 0;
                        int32_t val_xy1 = 0;
                        int32_t val_xy2 = 0;
                        int32_t val_xy3 = 0;
                        if (mask0) {
                            val_xy0 += wtab[0] * src[loc_buffer + 0];
                            val_xy1 += wtab[0] * src[loc_buffer + 1];
                            val_xy2 += wtab[0] * src[loc_buffer + 2];
                            val_xy3 += wtab[0] * src[loc_buffer + 3];
                        } else {
                            val_xy0 += wtab[0] * borderValue;
                            val_xy1 += wtab[0] * borderValue;
                            val_xy2 += wtab[0] * borderValue;
                            val_xy3 += wtab[0] * borderValue;
                        }
                        if (mask1) {
                            val_xy0 += wtab[1] * src[loc_buffer + 4];
                            val_xy1 += wtab[1] * src[loc_buffer + 5];
                            val_xy2 += wtab[1] * src[loc_buffer + 6];
                            val_xy3 += wtab[1] * src[loc_buffer + 7];
                        } else {
                            val_xy0 += wtab[1] * borderValue;
                            val_xy1 += wtab[1] * borderValue;
                            val_xy2 += wtab[1] * borderValue;
                            val_xy3 += wtab[1] * borderValue;
                        }
                        if (mask2) {
                            val_xy0 += wtab[2] * src2[loc_buffer + 0];
                            val_xy1 += wtab[2] * src2[loc_buffer + 1];
                            val_xy2 += wtab[2] * src2[loc_buffer + 2];
                            val_xy3 += wtab[2] * src2[loc_buffer + 3];
                        } else {
                            val_xy0 += wtab[2] * borderValue;
                            val_xy1 += wtab[2] * borderValue;
                            val_xy2 += wtab[2] * borderValue;
                            val_xy3 += wtab[2] * borderValue;
                        }
                        if (mask3) {
                            val_xy0 += wtab[3] * src2[loc_buffer + 4];
                            val_xy1 += wtab[3] * src2[loc_buffer + 5];
                            val_xy2 += wtab[3] * src2[loc_buffer + 6];
                            val_xy3 += wtab[3] * src2[loc_buffer + 7];
                        } else {
                            val_xy0 += wtab[3] * borderValue;
                            val_xy1 += wtab[3] * borderValue;
                            val_xy2 += wtab[3] * borderValue;
                            val_xy3 += wtab[3] * borderValue;
                        }

                        dst[final_loc + 0] = static_cast<uint8_t>((val_xy0 + DELTA) >> 15);
                        dst[final_loc + 1] = static_cast<uint8_t>((val_xy1 + DELTA) >> 15);
                        dst[final_loc + 2] = static_cast<uint8_t>((val_xy2 + DELTA) >> 15);
                        dst[final_loc + 3] = static_cast<uint8_t>((val_xy3 + DELTA) >> 15);
                    } else if (borderMode == BORDER_TRANSPARENT) {
                        continue;
                    } else if (borderMode == BORDER_REPLICATE) {
                        int32_t sx0 = clip(new_x_loc, 0, inWidth - 1);
                        int32_t sy0 = clip(new_y_loc, 0, inHeight - 1);
                        int32_t sx1 = clip((new_x_loc + 1), 0, inWidth - 1);
                        int32_t sy1 = clip((new_y_loc + 1), 0, inHeight - 1);

                        int32_t val_xy0 =
                            src[sy0 * inWidthStride + sx0 * 4 + 0] * wtab[0] +
                            src[sy0 * inWidthStride + sx1 * 4 + 0] * wtab[1] +
                            src[sy1 * inWidthStride + sx0 * 4 + 0] * wtab[2] +
                            src[sy1 * inWidthStride + sx1 * 4 + 0] * wtab[3];

                        int32_t val_xy1 =
                            src[sy0 * inWidthStride + sx0 * 4 + 1] * wtab[0] +
                            src[sy0 * inWidthStride + sx1 * 4 + 1] * wtab[1] +
                            src[sy1 * inWidthStride + sx0 * 4 + 1] * wtab[2] +
                            src[sy1 * inWidthStride + sx1 * 4 + 1] * wtab[3];

                        int32_t val_xy2 =
                            src[sy0 * inWidthStride + sx0 * 4 + 2] * wtab[0] +
                            src[sy0 * inWidthStride + sx1 * 4 + 2] * wtab[1] +
                            src[sy1 * inWidthStride + sx0 * 4 + 2] * wtab[2] +
                            src[sy1 * inWidthStride + sx1 * 4 + 2] * wtab[3];

                        int32_t val_xy3 =
                            src[sy0 * inWidthStride + sx0 * 4 + 3] * wtab[0] +
                            src[sy0 * inWidthStride + sx1 * 4 + 3] * wtab[1] +
                            src[sy1 * inWidthStride + sx0 * 4 + 3] * wtab[2] +
                            src[sy1 * inWidthStride + sx1 * 4 + 3] * wtab[3];

                        int32_t final_loc = final_loc_base + x * 4;
                        dst[final_loc + 0] = static_cast<uint8_t>((val_xy0 + DELTA) >> 15);
                        dst[final_loc + 1] = static_cast<uint8_t>((val_xy1 + DELTA) >> 15);
                        dst[final_loc + 2] = static_cast<uint8_t>((val_xy2 + DELTA) >> 15);
                        dst[final_loc + 3] = static_cast<uint8_t>((val_xy3 + DELTA) >> 15);
                    }
                }
            }

            int32_t x = end_x - x_count + 1;
            int32x4_t DELTA_vec = vdupq_n_s32(DELTA);
            uint8_t *dst_loc = dst + final_loc_base + x * 4;
            int16_t *BilinearTab_ptr = BilinearTab_i[0][0];
            int32_t simd_loop = x_count >> 2;
            if (simd_loop > 0) {
                asm volatile(
                    "subs x25, %2, #1\n\t"
                    "blt 1f\n\t"
                    "add x14, %4, %3, lsl #1\n\t"
                    "add x15, %5, %3, lsl #2\n\t"
                    "ldrsh x19, [x14]\n\t"
                    "ldrsh x20, [x14, #2]\n\t"
                    "add x14, x14, #4\n\t"
                    "ldpsw x21, x22, [x15], #8\n\t"
                    "ldpsw x26, x27, [x15], #8\n\t"
                    "add x19, %6, x19, lsl #3\n\t"
                    "add x20, %6, x20, lsl #3\n\t"
                    "0:\n\t"
                    "ldrsh x23, [x14]\n\t"
                    "ldrsh x24, [x14, #2]\n\t"
                    "add x14, x14, #4\n\t"
                    //vec00 vec01 vec10 vec11
                    //vec20 vec21 vec30 vec31
                    "ldr d2, [%7, x21]\n\t"
                    "ldr d4, [%7, x22]\n\t"
                    "ldr d16, [%7, x26]\n\t"
                    "ldr d18, [%7, x27]\n\t"
                    "ldr d3, [%8, x21]\n\t"
                    "ldr d5, [%8, x22]\n\t"
                    "ldr d17, [%8, x26]\n\t"
                    "ldr d19, [%8, x27]\n\t"
                    //wtab0 and wtab1
                    "ld1 {v0.4h}, [x19]\n\t"
                    "ld1 {v1.4h}, [x20]\n\t"
                    "ldpsw x21, x22, [x15], #8\n\t"
                    "add x23, %6, x23, lsl #3\n\t"
                    "add x24, %6, x24, lsl #3\n\t"
                    //calculation of vec00,01,10,11
                    "ushll v2.8h, v2.8b, #0\n\t"
                    "ushll v3.8h, v3.8b, #0\n\t"
                    "ushll v4.8h, v4.8b, #0\n\t"
                    "ushll v5.8h, v5.8b, #0\n\t"
                    "mov v6.d[0], v2.d[1]\n\t"
                    "mov v7.d[0], v3.d[1]\n\t"
                    "mov v8.d[0], v4.d[1]\n\t"
                    "mov v9.d[0], v5.d[1]\n\t"
                    //vec20 vec21 vec30 vec31
                    "#ldr d16, [%7, x26]\n\t"
                    "#ldr d18, [%7, x27]\n\t"
                    "#ldr d17, [%8, x26]\n\t"
                    "#ldr d19, [%8, x27]\n\t"
                    //wtab2 and wtab3
                    "ld1 {v14.4h}, [x23]\n\t"
                    "ld1 {v15.4h}, [x24]\n\t"
                    "smull v10.4s, v2.4h, v0.h[0]\n\t"
                    "smull v11.4s, v3.4h, v0.h[2]\n\t"
                    "smull v12.4s, v4.4h, v1.h[0]\n\t"
                    "smull v13.4s, v5.4h, v1.h[2]\n\t"
                    "smlal v10.4s, v6.4h, v0.h[1]\n\t"
                    "smlal v11.4s, v7.4h, v0.h[3]\n\t"
                    "smlal v12.4s, v8.4h, v1.h[1]\n\t"
                    "smlal v13.4s, v9.4h, v1.h[3]\n\t"
                    //next loop
                    "ldrsh x19, [x14]\n\t"
                    "ldrsh x20, [x14, #2]\n\t"
                    "add x14, x14, #4\n\t"
                    "ldpsw x26, x27, [x15], #8\n\t"
                    //calculation of vec00,01,10,11
                    "ushll v16.8h, v16.8b, #0\n\t"
                    "ushll v17.8h, v17.8b, #0\n\t"
                    "ushll v18.8h, v18.8b, #0\n\t"
                    "ushll v19.8h, v19.8b, #0\n\t"
                    "add x19, %6, x19, lsl #3\n\t"
                    "add x20, %6, x20, lsl #3\n\t"
                    "mov v6.d[0], v16.d[1]\n\t"
                    "prfm pldl3keep, [%7, x21]\n\t"
                    "prfm pldl3keep, [%7, x22]\n\t"
                    "mov v7.d[0], v17.d[1]\n\t"
                    "prfm pldl3keep, [%7, x26]\n\t"
                    "prfm pldl3keep, [%7, x27]\n\t"
                    "mov v8.d[0], v18.d[1]\n\t"
                    "prfm pldl3keep, [%8, x21]\n\t"
                    "prfm pldl3keep, [%8, x22]\n\t"
                    "mov v9.d[0], v19.d[1]\n\t"
                    "prfm pldl3keep, [%8, x26]\n\t"
                    "prfm pldl3keep, [%8, x27]\n\t"
                    "smull v20.4s, v16.4h, v14.h[0]\n\t"
                    "smull v21.4s, v17.4h, v14.h[2]\n\t"
                    "smull v22.4s, v18.4h, v15.h[0]\n\t"
                    "smull v23.4s, v19.4h, v15.h[2]\n\t"
                    "smlal v20.4s, v6.4h, v14.h[1]\n\t"
                    "smlal v21.4s, v7.4h, v14.h[3]\n\t"
                    "smlal v22.4s, v8.4h, v15.h[1]\n\t"
                    "smlal v23.4s, v9.4h, v15.h[3]\n\t"
                    //results calculation
                    "add v10.4s, v10.4s, v11.4s\n\t"
                    "add v12.4s, v12.4s, v13.4s\n\t"
                    "add v20.4s, v20.4s, v21.4s\n\t"
                    "add v22.4s, v22.4s, v23.4s\n\t"
                    "add v10.4s, v10.4s, %9.4s\n\t"
                    "add v12.4s, v12.4s, %9.4s\n\t"
                    "add v20.4s, v20.4s, %9.4s\n\t"
                    "add v22.4s, v22.4s, %9.4s\n\t"
                    "shrn v10.4h, v10.4s, #15\n\t"
                    "shrn v11.4h, v12.4s, #15\n\t"
                    "shrn v12.4h, v20.4s, #15\n\t"
                    "shrn v13.4h, v22.4s, #15\n\t"
                    "ins v10.d[1], v11.d[0]\n\t"
                    "ins v12.d[1], v13.d[0]\n\t"
                    "sqxtun v10.8b, v10.8h\n\t"
                    "sqxtun v12.8b, v12.8h\n\t"
                    "subs x25, x25, #1\n\t"
                    "st1 {v10.8b}, [%0], #8\n\t"
                    "st1 {v12.8b}, [%0], #8\n\t"
                    "bge 0b\n\t"
                    "1:\n\t"
                    : "=r"(dst_loc)
                    : "0"(dst_loc), "r"(simd_loop), "r"(x), "r"(tab_loc), "r"(buf_loc), "r"(BilinearTab_ptr), "r"(src), "r"(src2), "w"(DELTA_vec)
                    : "cc", "memory", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
                x = x + (simd_loop << 2);
            }
            for (; x <= end_x; x++) {
                int32_t final_loc = final_loc_base + x * 4;
                int32_t loc_buffer = buf_loc[x];
                int16_t *wtab = BilinearTab_i[tab_loc[x]][0];

                int32_t val_xy0 = wtab[0] * src[loc_buffer + 0] +
                                  wtab[1] * src[loc_buffer + 4] +
                                  wtab[2] * src2[loc_buffer + 0] +
                                  wtab[3] * src2[loc_buffer + 4];
                int32_t val_xy1 = wtab[0] * src[loc_buffer + 1] +
                                  wtab[1] * src[loc_buffer + 5] +
                                  wtab[2] * src2[loc_buffer + 1] +
                                  wtab[3] * src2[loc_buffer + 5];
                int32_t val_xy2 = wtab[0] * src[loc_buffer + 2] +
                                  wtab[1] * src[loc_buffer + 6] +
                                  wtab[2] * src2[loc_buffer + 2] +
                                  wtab[3] * src2[loc_buffer + 6];
                int32_t val_xy3 = wtab[0] * src[loc_buffer + 3] +
                                  wtab[1] * src[loc_buffer + 7] +
                                  wtab[2] * src2[loc_buffer + 3] +
                                  wtab[3] * src2[loc_buffer + 7];

                dst[final_loc + 0] = static_cast<uint8_t>((val_xy0 + DELTA) >> 15);
                dst[final_loc + 1] = static_cast<uint8_t>((val_xy1 + DELTA) >> 15);
                dst[final_loc + 2] = static_cast<uint8_t>((val_xy2 + DELTA) >> 15);
                dst[final_loc + 3] = static_cast<uint8_t>((val_xy3 + DELTA) >> 15);
            }
        }

        delete[] buf_loc;
        delete[] short_buf;
    } // cn == 4
    free(buffer);
}

// Nearest-point affine warp entry point for single-channel uint8_t images.
//
// Checks the arguments, then hands the work to warpAffine_nearest<uint8_t, 1>.
// Returns RC_INVALID_VALUE when a buffer pointer is null, when a dimension is
// non-positive, when a stride is smaller than its width, or when border_type
// is not one of BORDER_CONSTANT / BORDER_REPLICATE / BORDER_TRANSPARENT.
// Returns RC_SUCCESS otherwise.
template <>
::ppl::common::RetCode WarpAffineNearestPoint<uint8_t, 1>(
    int32_t inHeight,
    int32_t inWidth,
    int32_t inWidthStride,
    const uint8_t *inData,
    int32_t outHeight,
    int32_t outWidth,
    int32_t outWidthStride,
    uint8_t *outData,
    const float *affineMatrix,
    BorderType border_type,
    uint8_t borderValue)
{
    // Reject missing buffers up front.
    const bool buffers_present = (nullptr != inData) && (nullptr != outData) && (nullptr != affineMatrix);
    if (!buffers_present) {
        return ppl::common::RC_INVALID_VALUE;
    }

    // Both images need positive extents and a stride that covers one row.
    const bool src_geometry_ok = (inHeight > 0) && (inWidth > 0) && (inWidthStride >= inWidth);
    const bool dst_geometry_ok = (outHeight > 0) && (outWidth > 0) && (outWidthStride >= outWidth);
    if (!(src_geometry_ok && dst_geometry_ok)) {
        return ppl::common::RC_INVALID_VALUE;
    }

    // Only these three border modes are accepted here.
    const bool border_supported = (border_type == BORDER_CONSTANT) ||
                                  (border_type == BORDER_REPLICATE) ||
                                  (border_type == BORDER_TRANSPARENT);
    if (!border_supported) {
        return ppl::common::RC_INVALID_VALUE;
    }

    warpAffine_nearest<uint8_t, 1>(outData, inData, inHeight, inWidth, inWidthStride, outHeight, outWidth, outWidthStride, affineMatrix, border_type, borderValue);
    return ppl::common::RC_SUCCESS;
}

// Nearest-point affine warp entry point for 3-channel interleaved uint8_t
// images.
//
// Checks the arguments, then hands the work to warpAffine_nearest<uint8_t, 3>.
// Returns RC_INVALID_VALUE when a buffer pointer is null, when a dimension is
// non-positive, when a stride cannot hold a full row of width * 3 elements, or
// when border_type is not one of BORDER_CONSTANT / BORDER_REPLICATE /
// BORDER_TRANSPARENT. Returns RC_SUCCESS otherwise.
template <>
::ppl::common::RetCode WarpAffineNearestPoint<uint8_t, 3>(
    int32_t inHeight,
    int32_t inWidth,
    int32_t inWidthStride,
    const uint8_t *inData,
    int32_t outHeight,
    int32_t outWidth,
    int32_t outWidthStride,
    uint8_t *outData,
    const float *affineMatrix,
    BorderType border_type,
    uint8_t borderValue)
{
    constexpr int64_t kChannels = 3;

    if (inData == nullptr || outData == nullptr || affineMatrix == nullptr) {
        return ppl::common::RC_INVALID_VALUE;
    }
    if (inHeight <= 0 || inWidth <= 0 || outHeight <= 0 || outWidth <= 0) {
        return ppl::common::RC_INVALID_VALUE;
    }
    // A row of an interleaved image spans width * channels elements, so the
    // stride has to cover all channels, not just the pixel count. The product
    // is formed in 64 bits so that very large widths cannot overflow.
    const int64_t minInStride  = static_cast<int64_t>(inWidth) * kChannels;
    const int64_t minOutStride = static_cast<int64_t>(outWidth) * kChannels;
    if (inWidthStride < minInStride || outWidthStride < minOutStride) {
        return ppl::common::RC_INVALID_VALUE;
    }
    if (border_type != BORDER_CONSTANT && border_type != BORDER_REPLICATE && border_type != BORDER_TRANSPARENT) {
        return ppl::common::RC_INVALID_VALUE;
    }
    warpAffine_nearest<uint8_t, 3>(outData, inData, inHeight, inWidth, inWidthStride, outHeight, outWidth, outWidthStride, affineMatrix, border_type, borderValue);
    return ppl::common::RC_SUCCESS;
}

// Nearest-point affine warp entry point for 4-channel interleaved uint8_t
// images.
//
// Checks the arguments, then hands the work to warpAffine_nearest<uint8_t, 4>.
// Returns RC_INVALID_VALUE when a buffer pointer is null, when a dimension is
// non-positive, when a stride cannot hold a full row of width * 4 elements, or
// when border_type is not one of BORDER_CONSTANT / BORDER_REPLICATE /
// BORDER_TRANSPARENT. Returns RC_SUCCESS otherwise.
template <>
::ppl::common::RetCode WarpAffineNearestPoint<uint8_t, 4>(
    int32_t inHeight,
    int32_t inWidth,
    int32_t inWidthStride,
    const uint8_t *inData,
    int32_t outHeight,
    int32_t outWidth,
    int32_t outWidthStride,
    uint8_t *outData,
    const float *affineMatrix,
    BorderType border_type,
    uint8_t borderValue)
{
    constexpr int64_t kChannels = 4;

    if (inData == nullptr || outData == nullptr || affineMatrix == nullptr) {
        return ppl::common::RC_INVALID_VALUE;
    }
    if (inHeight <= 0 || inWidth <= 0 || outHeight <= 0 || outWidth <= 0) {
        return ppl::common::RC_INVALID_VALUE;
    }
    // A row of an interleaved image spans width * channels elements, so the
    // stride has to cover all channels, not just the pixel count. The product
    // is formed in 64 bits so that very large widths cannot overflow.
    const int64_t minInStride  = static_cast<int64_t>(inWidth) * kChannels;
    const int64_t minOutStride = static_cast<int64_t>(outWidth) * kChannels;
    if (inWidthStride < minInStride || outWidthStride < minOutStride) {
        return ppl::common::RC_INVALID_VALUE;
    }
    if (border_type != BORDER_CONSTANT && border_type != BORDER_REPLICATE && border_type != BORDER_TRANSPARENT) {
        return ppl::common::RC_INVALID_VALUE;
    }
    warpAffine_nearest<uint8_t, 4>(outData, inData, inHeight, inWidth, inWidthStride, outHeight, outWidth, outWidthStride, affineMatrix, border_type, borderValue);
    return ppl::common::RC_SUCCESS;
}

// Bilinear affine warp entry point for single-channel uint8_t images.
//
// Performs argument validation and then delegates to
// warpAffine_linear_uint8_t with a channel count of 1. Any failed check yields
// RC_INVALID_VALUE; a completed call yields RC_SUCCESS.
template <>
::ppl::common::RetCode WarpAffineLinear<uint8_t, 1>(
    int32_t inHeight,
    int32_t inWidth,
    int32_t inWidthStride,
    const uint8_t *inData,
    int32_t outHeight,
    int32_t outWidth,
    int32_t outWidthStride,
    uint8_t *outData,
    const float *affineMatrix,
    BorderType border_type,
    uint8_t borderValue)
{
    // Source, destination and matrix pointers must all be provided.
    if (!inData || !outData || !affineMatrix) {
        return ppl::common::RC_INVALID_VALUE;
    }

    // Extents must be positive and each stride must span at least one row.
    const bool bad_input_shape  = !(inHeight > 0) || !(inWidth > 0) || !(inWidthStride >= inWidth);
    const bool bad_output_shape = !(outHeight > 0) || !(outWidth > 0) || !(outWidthStride >= outWidth);
    if (bad_input_shape || bad_output_shape) {
        return ppl::common::RC_INVALID_VALUE;
    }

    // Anything other than constant / replicate / transparent is rejected.
    const bool is_constant    = (border_type == BORDER_CONSTANT);
    const bool is_replicate   = (border_type == BORDER_REPLICATE);
    const bool is_transparent = (border_type == BORDER_TRANSPARENT);
    if (!(is_constant || is_replicate || is_transparent)) {
        return ppl::common::RC_INVALID_VALUE;
    }

    warpAffine_linear_uint8_t(outData, inData, inHeight, inWidth, inWidthStride, outHeight, outWidth, outWidthStride, affineMatrix, 1, border_type, borderValue);
    return ppl::common::RC_SUCCESS;
}

// Bilinear affine warp entry point for 2-channel interleaved uint8_t images.
//
// Checks the arguments, then hands the work to warpAffine_linear_uint8_t with
// a channel count of 2. Returns RC_INVALID_VALUE when a buffer pointer is
// null, when a dimension is non-positive, when a stride cannot hold a full row
// of width * 2 elements, or when border_type is not one of BORDER_CONSTANT /
// BORDER_REPLICATE / BORDER_TRANSPARENT. Returns RC_SUCCESS otherwise.
template <>
::ppl::common::RetCode WarpAffineLinear<uint8_t, 2>(
    int32_t inHeight,
    int32_t inWidth,
    int32_t inWidthStride,
    const uint8_t *inData,
    int32_t outHeight,
    int32_t outWidth,
    int32_t outWidthStride,
    uint8_t *outData,
    const float *affineMatrix,
    BorderType border_type,
    uint8_t borderValue)
{
    constexpr int64_t kChannels = 2;

    if (inData == nullptr || outData == nullptr || affineMatrix == nullptr) {
        return ppl::common::RC_INVALID_VALUE;
    }
    if (inHeight <= 0 || inWidth <= 0 || outHeight <= 0 || outWidth <= 0) {
        return ppl::common::RC_INVALID_VALUE;
    }
    // A row of an interleaved image spans width * channels elements, so the
    // stride has to cover all channels, not just the pixel count. The product
    // is formed in 64 bits so that very large widths cannot overflow.
    const int64_t minInStride  = static_cast<int64_t>(inWidth) * kChannels;
    const int64_t minOutStride = static_cast<int64_t>(outWidth) * kChannels;
    if (inWidthStride < minInStride || outWidthStride < minOutStride) {
        return ppl::common::RC_INVALID_VALUE;
    }
    if (border_type != BORDER_CONSTANT && border_type != BORDER_REPLICATE && border_type != BORDER_TRANSPARENT) {
        return ppl::common::RC_INVALID_VALUE;
    }
    warpAffine_linear_uint8_t(outData, inData, inHeight, inWidth, inWidthStride, outHeight, outWidth, outWidthStride, affineMatrix, 2, border_type, borderValue);
    return ppl::common::RC_SUCCESS;
}

// Bilinear affine warp entry point for 3-channel interleaved uint8_t images.
//
// Checks the arguments, then hands the work to warpAffine_linear_uint8_t with
// a channel count of 3. Returns RC_INVALID_VALUE when a buffer pointer is
// null, when a dimension is non-positive, when a stride cannot hold a full row
// of width * 3 elements, or when border_type is not one of BORDER_CONSTANT /
// BORDER_REPLICATE / BORDER_TRANSPARENT. Returns RC_SUCCESS otherwise.
template <>
::ppl::common::RetCode WarpAffineLinear<uint8_t, 3>(
    int32_t inHeight,
    int32_t inWidth,
    int32_t inWidthStride,
    const uint8_t *inData,
    int32_t outHeight,
    int32_t outWidth,
    int32_t outWidthStride,
    uint8_t *outData,
    const float *affineMatrix,
    BorderType border_type,
    uint8_t borderValue)
{
    constexpr int64_t kChannels = 3;

    if (inData == nullptr || outData == nullptr || affineMatrix == nullptr) {
        return ppl::common::RC_INVALID_VALUE;
    }
    if (inHeight <= 0 || inWidth <= 0 || outHeight <= 0 || outWidth <= 0) {
        return ppl::common::RC_INVALID_VALUE;
    }
    // A row of an interleaved image spans width * channels elements, so the
    // stride has to cover all channels, not just the pixel count. The product
    // is formed in 64 bits so that very large widths cannot overflow.
    const int64_t minInStride  = static_cast<int64_t>(inWidth) * kChannels;
    const int64_t minOutStride = static_cast<int64_t>(outWidth) * kChannels;
    if (inWidthStride < minInStride || outWidthStride < minOutStride) {
        return ppl::common::RC_INVALID_VALUE;
    }
    if (border_type != BORDER_CONSTANT && border_type != BORDER_REPLICATE && border_type != BORDER_TRANSPARENT) {
        return ppl::common::RC_INVALID_VALUE;
    }
    warpAffine_linear_uint8_t(outData, inData, inHeight, inWidth, inWidthStride, outHeight, outWidth, outWidthStride, affineMatrix, 3, border_type, borderValue);
    return ppl::common::RC_SUCCESS;
}

// Bilinear affine warp entry point for 4-channel interleaved uint8_t images.
//
// Checks the arguments, then hands the work to warpAffine_linear_uint8_t with
// a channel count of 4. Returns RC_INVALID_VALUE when a buffer pointer is
// null, when a dimension is non-positive, when a stride cannot hold a full row
// of width * 4 elements, or when border_type is not one of BORDER_CONSTANT /
// BORDER_REPLICATE / BORDER_TRANSPARENT. Returns RC_SUCCESS otherwise.
template <>
::ppl::common::RetCode WarpAffineLinear<uint8_t, 4>(
    int32_t inHeight,
    int32_t inWidth,
    int32_t inWidthStride,
    const uint8_t *inData,
    int32_t outHeight,
    int32_t outWidth,
    int32_t outWidthStride,
    uint8_t *outData,
    const float *affineMatrix,
    BorderType border_type,
    uint8_t borderValue)
{
    constexpr int64_t kChannels = 4;

    if (inData == nullptr || outData == nullptr || affineMatrix == nullptr) {
        return ppl::common::RC_INVALID_VALUE;
    }
    if (inHeight <= 0 || inWidth <= 0 || outHeight <= 0 || outWidth <= 0) {
        return ppl::common::RC_INVALID_VALUE;
    }
    // A row of an interleaved image spans width * channels elements, so the
    // stride has to cover all channels, not just the pixel count. The product
    // is formed in 64 bits so that very large widths cannot overflow.
    const int64_t minInStride  = static_cast<int64_t>(inWidth) * kChannels;
    const int64_t minOutStride = static_cast<int64_t>(outWidth) * kChannels;
    if (inWidthStride < minInStride || outWidthStride < minOutStride) {
        return ppl::common::RC_INVALID_VALUE;
    }
    if (border_type != BORDER_CONSTANT && border_type != BORDER_REPLICATE && border_type != BORDER_TRANSPARENT) {
        return ppl::common::RC_INVALID_VALUE;
    }
    warpAffine_linear_uint8_t(outData, inData, inHeight, inWidth, inWidthStride, outHeight, outWidth, outWidthStride, affineMatrix, 4, border_type, borderValue);
    return ppl::common::RC_SUCCESS;
}

}
}
} // namespace ppl::cv::arm
